Diffstat (limited to 'contrib/llvm/lib/CodeGen')
219 files changed, 24301 insertions, 11003 deletions
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index bb90861..5abf50e 100644
--- a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -128,8 +128,7 @@ AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
   }
 
   DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
-  DEBUG(for (int r = CriticalPathSet.find_first(); r != -1;
-             r = CriticalPathSet.find_next(r))
+  DEBUG(for (unsigned r : CriticalPathSet.set_bits())
           dbgs() << " " << TRI->getName(r));
   DEBUG(dbgs() << '\n');
 }
@@ -163,9 +162,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   // callee-saved register that is not saved in the prolog.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   BitVector Pristine = MFI.getPristineRegs(MF);
-  for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+  for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+       ++I) {
     unsigned Reg = *I;
-    if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+    if (!IsReturnBlock && !Pristine.test(Reg))
+      continue;
     for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
       unsigned AliasReg = *AI;
       State->UnionGroups(AliasReg, 0);
@@ -569,7 +570,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
   DEBUG({
     dbgs() << " ::";
-    for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+    for (unsigned r : BV.set_bits())
       dbgs() << " " << TRI->getName(r);
     dbgs() << "\n";
   });
@@ -962,10 +963,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
             // sure to update that as well.
             const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()];
             if (!SU) continue;
-            for (DbgValueVector::iterator DVI = DbgValues.begin(),
-                   DVE = DbgValues.end(); DVI != DVE; ++DVI)
-              if (DVI->second == Q.second.Operand->getParent())
-                UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
+            UpdateDbgValues(DbgValues, Q.second.Operand->getParent(),
+                            AntiDepReg, NewReg);
           }
 
           // We just went back in time and modified history; the
diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp
index 79ecc43..c2aecc6 100644
--- a/contrib/llvm/lib/CodeGen/Analysis.cpp
+++ b/contrib/llvm/lib/CodeGen/Analysis.cpp
@@ -24,8 +24,8 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
@@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
   bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;
   ADS = true;
 
-  AttrBuilder CallerAttrs(F->getAttributes(),
-                          AttributeSet::ReturnIndex);
+  AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex);
   AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
-                          AttributeSet::ReturnIndex);
+                          AttributeList::ReturnIndex);
 
   // Noalias is completely benign as far as calling convention goes, it
   // shouldn't affect whether the call is a tail call.
@@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
   return true;
 }
 
-bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
-  if (!GV->hasLinkOnceODRLinkage())
-    return false;
-
-  // We assume that anyone who sets global unnamed_addr on a non-constant knows
-  // what they're doing.
-  if (GV->hasGlobalUnnamedAddr())
-    return true;
-
-  // If it is a non constant variable, it needs to be uniqued across shared
-  // objects.
-  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
-    if (!Var->isConstant())
-      return false;
-  }
-
-  return GV->hasAtLeastLocalUnnamedAddr();
-}
-
 static void collectFuncletMembers(
     DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
     const MachineBasicBlock *MBB) {
diff --git a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
index 04f7f41..d14d931 100644
--- a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
+++ b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
@@ -60,6 +60,25 @@ public:
     if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg)
       MI.getOperand(0).setReg(NewReg);
   }
+
+  /// Update all DBG_VALUE instructions that may be affected by the dependency
+  /// breaker's update of ParentMI to use NewReg.
+  void UpdateDbgValues(const DbgValueVector &DbgValues, MachineInstr *ParentMI,
+                       unsigned OldReg, unsigned NewReg) {
+    // The following code is dependent on the order in which the DbgValues are
+    // constructed in ScheduleDAGInstrs::buildSchedGraph.
+    MachineInstr *PrevDbgMI = nullptr;
+    for (const auto &DV : make_range(DbgValues.crbegin(), DbgValues.crend())) {
+      MachineInstr *PrevMI = DV.second;
+      if ((PrevMI == ParentMI) || (PrevMI == PrevDbgMI)) {
+        MachineInstr *DbgMI = DV.first;
+        UpdateDbgValue(*DbgMI, OldReg, NewReg);
+        PrevDbgMI = DbgMI;
+      } else if (PrevDbgMI) {
+        break; // If no match and already found a DBG_VALUE, we're done.
+      }
+    }
+  }
 };
 
 }
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index 61149d9..8b1376a 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -14,6 +14,7 @@
 #include "DwarfException.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -27,7 +28,6 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 24fdbfc..ff427c9 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -12,47 +12,101 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "AsmPrinterHandler.h"
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
 #include "WinException.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ObjectUtils.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Value.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -69,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription =
 
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
+static cl::opt<bool>
+    PrintSchedule("print-schedule", cl::Hidden, cl::init(false),
+                  cl::desc("Print 'sched: [latency:throughput]' in .s output"));
+
 char AsmPrinter::ID = 0;
 
 typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type;
@@ -78,7 +136,6 @@ static gcp_map_type &getGCMap(void *&P) {
   return *(gcp_map_type*)P;
 }
 
-
 /// getGVAlignmentLog2 - Return the alignment to use for the specified global
 /// value in log2 form. This rounds up to the preferred alignment if possible
 /// and legal.
@@ -107,16 +164,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,
 AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
     : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
-      OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
-      isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) {
-  DD = nullptr;
-  MMI = nullptr;
-  LI = nullptr;
-  MF = nullptr;
-  CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr;
-  CurrentFnBegin = nullptr;
-  CurrentFnEnd = nullptr;
-  GCMetadataPrinters = nullptr;
+      OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) {
   VerboseAsm = OutStreamer->isVerboseAsm();
 }
@@ -171,6 +219,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
   AU.addRequired<MachineModuleInfo>();
+  AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   AU.addRequired<GCModuleInfo>();
   if (isVerbose())
     AU.addRequired<MachineLoopInfo>();
@@ -223,7 +272,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   // don't, this at least helps the user find where a global came from.
   if (MAI->hasSingleParameterDotFile()) {
     // .file "foo.c"
-    OutStreamer->EmitFileDirective(M.getModuleIdentifier());
+    OutStreamer->EmitFileDirective(M.getSourceFileName());
   }
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
@@ -571,7 +620,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 ///
 /// \p Value - The value to emit.
 /// \p Size - The size of the integer (in bytes) to emit.
-void AsmPrinter::EmitDebugValue(const MCExpr *Value,
+void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
                                 unsigned Size) const {
   OutStreamer->EmitValue(Value, Size);
 }
@@ -579,12 +628,17 @@ void AsmPrinter::EmitDebugValue(const MCExpr *Value,
 /// EmitFunctionHeader - This method emits the header for the current
 /// function.
 void AsmPrinter::EmitFunctionHeader() {
+  const Function *F = MF->getFunction();
+
+  if (isVerbose())
+    OutStreamer->GetCommentOS()
+        << "-- Begin function "
+        << GlobalValue::dropLLVMManglingEscape(F->getName()) << '\n';
+
   // Print out constants referenced by the function
   EmitConstantPool();
 
   // Print the 'header' of function.
-  const Function *F = MF->getFunction();
-
   OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(F, TM));
   EmitVisibility(CurrentFnSym, F->getVisibility());
@@ -602,8 +656,23 @@ void AsmPrinter::EmitFunctionHeader() {
   }
 
   // Emit the prefix data.
-  if (F->hasPrefixData())
-    EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+  if (F->hasPrefixData()) {
+    if (MAI->hasSubsectionsViaSymbols()) {
+      // Preserving prefix data on platforms which use subsections-via-symbols
+      // is a bit tricky. Here we introduce a symbol for the prefix data
+      // and use the .alt_entry attribute to mark the function's real entry point
+      // as an alternative entry point to the prefix-data symbol.
+      MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
+      OutStreamer->EmitLabel(PrefixSym);
+
+      EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+
+      // Emit an .alt_entry directive for the actual function symbol.
+      OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
+    } else {
+      EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+    }
+  }
 
   // Emit the CurrentFnSym. This is a virtual function to allow targets to
   // do their wild and crazy things as required.
@@ -660,7 +729,8 @@ void AsmPrinter::EmitFunctionEntryLabel() {
 }
 
 /// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS,
+                         AsmPrinter *AP) {
   const MachineFunction *MF = MI.getParent()->getParent();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
@@ -668,6 +738,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS,
   int FI;
 
   const MachineFrameInfo &MFI = MF->getFrameInfo();
+  bool Commented = false;
 
   // We assume a single instruction only has a spill or reload, not
   // both.
@@ -675,24 +746,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS,
   if (TII->isLoadFromStackSlotPostFE(MI, FI)) {
     if (MFI.isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
-      CommentOS << MMO->getSize() << "-byte Reload\n";
+      CommentOS << MMO->getSize() << "-byte Reload";
+      Commented = true;
     }
   } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) {
-    if (MFI.isSpillSlotObjectIndex(FI))
-      CommentOS << MMO->getSize() << "-byte Folded Reload\n";
+    if (MFI.isSpillSlotObjectIndex(FI)) {
+      CommentOS << MMO->getSize() << "-byte Folded Reload";
+      Commented = true;
+    }
   } else if (TII->isStoreToStackSlotPostFE(MI, FI)) {
     if (MFI.isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
-      CommentOS << MMO->getSize() << "-byte Spill\n";
+      CommentOS << MMO->getSize() << "-byte Spill";
+      Commented = true;
     }
   } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) {
-    if (MFI.isSpillSlotObjectIndex(FI))
-      CommentOS << MMO->getSize() << "-byte Folded Spill\n";
+    if (MFI.isSpillSlotObjectIndex(FI)) {
+      CommentOS << MMO->getSize() << "-byte Folded Spill";
+      Commented = true;
+    }
   }
 
   // Check for spill-induced copies
-  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
-    CommentOS << " Reload Reuse\n";
+  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) {
+    Commented = true;
+    CommentOS << " Reload Reuse";
+  }
+
+  if (Commented && AP->EnablePrintSchedInfo)
+    // If any comment was added above and we need sched info comment then
+    // add this new comment just after the above comment w/o "\n" between them.
+    CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n";
+  else if (Commented)
+    CommentOS << "\n";
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -739,46 +825,30 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
   const DILocalVariable *V = MI->getDebugVariable();
   if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) {
-    StringRef Name = SP->getDisplayName();
+    StringRef Name = SP->getName();
     if (!Name.empty())
       OS << Name << ":";
   }
   OS << V->getName();
-
-  const DIExpression *Expr = MI->getDebugExpression();
-  auto Fragment = Expr->getFragmentInfo();
-  if (Fragment)
-    OS << " [fragment offset=" << Fragment->OffsetInBits
-       << " size=" << Fragment->SizeInBits << "]";
   OS << " <- ";
 
   // The second operand is only an offset if it's an immediate.
-  bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
-  int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0;
-
-  for (unsigned i = 0; i < Expr->getNumElements(); ++i) {
-    uint64_t Op = Expr->getElement(i);
-    if (Op == dwarf::DW_OP_LLVM_fragment) {
-      // There can't be any operands after this in a valid expression
-      break;
-    } else if (Deref) {
-      // We currently don't support extra Offsets or derefs after the first
-      // one. Bail out early instead of emitting an incorrect comment
-      OS << " [complex expression]";
-      AP.OutStreamer->emitRawComment(OS.str());
-      return true;
-    } else if (Op == dwarf::DW_OP_deref) {
-      Deref = true;
-      continue;
-    }
-
-    uint64_t ExtraOffset = Expr->getElement(i++);
-    if (Op == dwarf::DW_OP_plus)
-      Offset += ExtraOffset;
-    else {
-      assert(Op == dwarf::DW_OP_minus);
-      Offset -= ExtraOffset;
+  bool MemLoc = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
+  int64_t Offset = MemLoc ? MI->getOperand(1).getImm() : 0;
+  const DIExpression *Expr = MI->getDebugExpression();
+  if (Expr->getNumElements()) {
+    OS << '[';
+    bool NeedSep = false;
+    for (auto Op : Expr->expr_ops()) {
+      if (NeedSep)
+        OS << ", ";
+      else
+        NeedSep = true;
+      OS << dwarf::OperationEncodingString(Op.getOp());
+      for (unsigned I = 0; I < Op.getNumArgs(); ++I)
+        OS << ' ' << Op.getArg(I);
     }
+    OS << "] ";
   }
 
   // Register or immediate value. Register 0 means undef.
@@ -809,7 +879,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
     const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering();
     Offset += TFI->getFrameIndexReference(*AP.MF,
                                           MI->getOperand(0).getIndex(), Reg);
-    Deref = true;
+    MemLoc = true;
   }
   if (Reg == 0) {
     // Suppress offset, it is not meaningful here.
@@ -818,12 +888,12 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
       AP.OutStreamer->emitRawComment(OS.str());
       return true;
     }
-    if (Deref)
+    if (MemLoc)
       OS << '[';
     OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo());
   }
 
-  if (Deref)
+  if (MemLoc)
     OS << '+' << Offset << ']';
 
   // NOTE: Want this comment at start of line, don't emit with AddComment.
@@ -855,6 +925,16 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
   if (needsCFIMoves() == CFI_M_None)
     return;
 
+  // If there is no "real" instruction following this CFI instruction, skip
+  // emitting it; it would be beyond the end of the function's FDE range.
+  auto *MBB = MI.getParent();
+  auto I = std::next(MI.getIterator());
+  while (I != MBB->end() && I->isTransient())
+    ++I;
+  if (I == MBB->instr_end() &&
+      MBB->getReverseIterator() == MBB->getParent()->rbegin())
+    return;
+
   const std::vector<MCCFIInstruction> &Instrs = MF->getFrameInstructions();
   unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
   const MCCFIInstruction &CFI = Instrs[CFIIndex];
@@ -871,6 +951,19 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
                        MCConstantExpr::create(FrameOffset, OutContext));
 }
 
+static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF,
+                                           MachineModuleInfo *MMI) {
+  if (!MF.getLandingPads().empty() || MF.hasEHFunclets() || MMI->hasDebugInfo())
+    return true;
+
+  // We might emit an EH table that uses function begin and end labels even if
+  // we don't have any landingpads.
+  if (!MF.getFunction()->hasPersonalityFn())
+    return false;
+  return !isNoOpWithoutInvoke(
+      classifyEHPersonality(MF.getFunction()->getPersonalityFn()));
+}
+
 /// EmitFunctionBody - This method emits the body and trailer for a
 /// function.
 void AsmPrinter::EmitFunctionBody() {
@@ -883,6 +976,7 @@ void AsmPrinter::EmitFunctionBody() {
 
   // Print out code for the function.
   bool HasAnyRealCode = false;
+  int NumInstsInFunction = 0;
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
     EmitBasicBlockStart(MBB);
@@ -892,7 +986,7 @@ void AsmPrinter::EmitFunctionBody() {
       if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
           !MI.isDebugValue()) {
         HasAnyRealCode = true;
-        ++EmittedInsts;
+        ++NumInstsInFunction;
       }
 
       if (ShouldPrintDebugScopes) {
@@ -905,7 +999,7 @@ void AsmPrinter::EmitFunctionBody() {
       }
 
       if (isVerbose())
-        emitComments(MI, OutStreamer->GetCommentOS());
+        emitComments(MI, OutStreamer->GetCommentOS(), this);
 
       switch (MI.getOpcode()) {
       case TargetOpcode::CFI_INSTRUCTION:
@@ -953,18 +1047,34 @@ void AsmPrinter::EmitFunctionBody() {
     EmitBasicBlockEnd(MBB);
   }
 
+  EmittedInsts += NumInstsInFunction;
+  MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount",
+                                      MF->getFunction()->getSubprogram(),
+                                      &MF->front());
+  R << ore::NV("NumInstructions", NumInstsInFunction)
+    << " instructions in function";
+  ORE->emit(R);
+
   // If the function is empty and the object file uses .subsections_via_symbols,
   // then we need to emit *something* to the function body to prevent the
   // labels from collapsing together. Just emit a noop.
-  if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) {
+  // Similarly, don't emit empty functions on Windows either. It can lead to
+  // duplicate entries (two functions with the same RVA) in the Guard CF Table
+  // after linking, causing the kernel not to load the binary:
+  // https://developercommunity.visualstudio.com/content/problem/45366/vc-linker-creates-invalid-dll-with-clang-cl.html
+  // FIXME: Hide this behind some API in e.g. MCAsmInfo or MCTargetStreamer.
+  const Triple &TT = TM.getTargetTriple();
+  if (!HasAnyRealCode && (MAI->hasSubsectionsViaSymbols() ||
+                          (TT.isOSWindows() && TT.isOSBinFormatCOFF()))) {
     MCInst Noop;
-    MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop);
-    OutStreamer->AddComment("avoids zero-length function");
+    MF->getSubtarget().getInstrInfo()->getNoop(Noop);
 
     // Targets can opt-out of emitting the noop here by leaving the opcode
     // unspecified.
-    if (Noop.getOpcode())
+    if (Noop.getOpcode()) {
+      OutStreamer->AddComment("avoids zero-length function");
       OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+    }
   }
 
   const Function *F = MF->getFunction();
@@ -981,8 +1091,8 @@ void AsmPrinter::EmitFunctionBody() {
   // Emit target-specific gunk after the function body.
   EmitFunctionBodyEnd();
 
-  if (!MF->getLandingPads().empty() || MMI->hasDebugInfo() ||
-      MF->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) {
+  if (needFuncLabelsForEHOrDebugInfo(*MF, MMI) ||
+      MAI->hasDotTypeDotSizeDirective()) {
     // Create a symbol for the end of function.
     CurrentFnEnd = createTempSymbol("func_end");
     OutStreamer->EmitLabel(CurrentFnEnd);
@@ -1015,6 +1125,9 @@ void AsmPrinter::EmitFunctionBody() {
     HI.Handler->endFunction(MF);
   }
 
+  if (isVerbose())
+    OutStreamer->GetCommentOS() << "-- End function\n";
+
   OutStreamer->AddBlankLine();
 }
 
@@ -1175,11 +1288,7 @@ bool AsmPrinter::doFinalization(Module &M) {
 
   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
 
-  // Emit module flags.
-  SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
-  M.getModuleFlagsMetadata(ModuleFlags);
-  if (!ModuleFlags.empty())
-    TLOF.emitModuleFlags(*OutStreamer, ModuleFlags, TM);
+  TLOF.emitModuleMetadata(*OutStreamer, M, TM);
 
   if (TM.getTargetTriple().isOSBinFormatELF()) {
     MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
@@ -1238,7 +1347,7 @@ bool AsmPrinter::doFinalization(Module &M) {
           break;
         AliasStack.push_back(Cur);
       }
-      for (const GlobalAlias *AncestorAlias : reverse(AliasStack))
+      for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))
         emitGlobalIndirectSymbol(M, *AncestorAlias);
       AliasStack.clear();
     }
@@ -1266,7 +1375,7 @@ bool AsmPrinter::doFinalization(Module &M) {
         OutContext.getOrCreateSymbol(StringRef("__morestack_addr"));
     OutStreamer->EmitLabel(AddrSymbol);
 
-    unsigned PtrSize = M.getDataLayout().getPointerSize(0);
+    unsigned PtrSize = MAI->getCodePointerSize();
     OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
                                  PtrSize);
   }
@@ -1304,26 +1413,34 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   CurrentFnBegin = nullptr;
   CurExceptionSym = nullptr;
   bool NeedsLocalForSize = MAI->needsLocalForSize();
-  if (!MF.getLandingPads().empty() || MMI->hasDebugInfo() ||
-      MF.hasEHFunclets() || NeedsLocalForSize) {
+  if (needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize) {
     CurrentFnBegin = createTempSymbol("func_begin");
     if (NeedsLocalForSize)
       CurrentFnSymForSize = CurrentFnBegin;
   }
 
+  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
   if (isVerbose())
     LI = &getAnalysis<MachineLoopInfo>();
+
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  EnablePrintSchedInfo = PrintSchedule.getNumOccurrences()
+                             ? PrintSchedule
+                             : STI.supportPrintSchedInfo();
 }
 
 namespace {
+
 // Keep track the alignment, constpool entries per Section.
   struct SectionCPs {
     MCSection *S;
     unsigned Alignment;
     SmallVector<unsigned, 4> CPEs;
+
     SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}
   };
-}
+
+} // end anonymous namespace
 
 /// EmitConstantPool - Print to the current output stream assembly
 /// representations of the constants in the constant pool MCP. This is
@@ -1547,7 +1664,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
   OutStreamer->EmitValue(Value, EntrySize);
 }
 
-
 /// EmitSpecialLLVMGlobal - Check to see if the specified global is a
 /// special global used by LLVM. If so, emit it and return true, otherwise
 /// do nothing and return false.
@@ -1598,13 +1714,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
 }
 
 namespace {
+
 struct Structor {
-  Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {}
-  int Priority;
-  llvm::Constant *Func;
-  llvm::GlobalValue *ComdatKey;
+  int Priority = 0;
+  Constant *Func = nullptr;
+  GlobalValue *ComdatKey = nullptr;
+
+  Structor() = default;
 };
-} // end namespace
+
+} // end anonymous namespace
 
 /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
 /// priority.
@@ -1653,8 +1772,11 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
     const TargetLoweringObjectFile &Obj = getObjFileLowering();
     const MCSymbol *KeySym = nullptr;
     if (GlobalValue *GV = S.ComdatKey) {
-      if (GV->hasAvailableExternallyLinkage())
-        // If the associated variable is available_externally, some other TU
+      if (GV->isDeclarationForLinker())
+        // If the associated variable is not defined in this module
+        // (it might be available_externally, or have been an
+        // available_externally definition that was dropped by the
+        // EliminateAvailableExternally pass), some other TU
         // will provide its dynamic initializer.
         continue;
 
@@ -1931,7 +2053,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) {
   return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
 }
 
-
 /// isRepeatedByteSequence - Determine whether the given value is
 /// composed of a repeated sequence of identical bytes and return the
 /// byte value.  If it is not a repeated sequence, return -1.
@@ -1972,7 +2093,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {
 static void emitGlobalConstantDataSequential(const DataLayout &DL,
                                              const ConstantDataSequential *CDS,
                                              AsmPrinter &AP) {
-
   // See if we can aggregate this into a .fill, if so, emit it as such.
   int Value = isRepeatedByteSequence(CDS, DL);
   if (Value != -1) {
@@ -2006,7 +2126,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
                         CDS->getNumElements();
   if (unsigned Padding = Size - EmittedSize)
     AP.OutStreamer->EmitZeros(Padding);
-
 }
 
 static void emitGlobalConstantArray(const DataLayout &DL,
@@ -2145,7 +2264,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
       // chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
       ExtraBits = Realigned.getRawData()[0] &
         (((uint64_t)-1) >> (64 - ExtraBitsSize));
-      Realigned = Realigned.lshr(ExtraBitsSize);
+      Realigned.lshrInPlace(ExtraBitsSize);
     } else
       ExtraBits = Realigned.getRawData()[BitWidth / 64];
   }
@@ -2420,8 +2539,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
   return OutContext.getOrCreateSymbol(NameStr);
 }
 
-
-
 /// PrintParentLoopComment - Print comments about parent loops of this one.
 static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
                                    unsigned FunctionNumber) {
@@ -2486,7 +2603,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
     PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
 }
 
-
 /// EmitBasicBlockStart - This method prints the label for the specified
 /// MachineBasicBlock, an alignment (if present) and a comment describing
 /// it if appropriate.
@@ -2607,8 +2723,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   return true;
 }
 
-
-
 GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
   if (!S.usesMetadata())
     return nullptr;
@@ -2639,7 +2753,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
 }
 
 /// Pin vtable to this file.
-AsmPrinterHandler::~AsmPrinterHandler() {}
+AsmPrinterHandler::~AsmPrinterHandler() = default;
 
 void AsmPrinterHandler::markFunctionEnd() {}
 
@@ -2663,37 +2777,61 @@ void AsmPrinter::emitXRayTable() {
   auto PrevSection = OutStreamer->getCurrentSectionOnly();
   auto Fn = MF->getFunction();
-  MCSection *Section = nullptr;
+  MCSection *InstMap = nullptr;
+  MCSection *FnSledIndex = nullptr;
   if (MF->getSubtarget().getTargetTriple().isOSBinFormatELF()) {
     if (Fn->hasComdat()) {
-      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+      InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
                                          ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
                                          Fn->getComdat()->getName());
+      FnSledIndex = OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS,
+                                             ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+                                             Fn->getComdat()->getName());
     } else {
-      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+      InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
                                          ELF::SHF_ALLOC);
+      FnSledIndex = OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS,
+                                             ELF::SHF_ALLOC);
     }
   } else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
-    Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+    InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
                                          SectionKind::getReadOnlyWithRel());
+    FnSledIndex = OutContext.getMachOSection("__DATA", "xray_fn_idx", 0,
+                                             SectionKind::getReadOnlyWithRel());
   } else {
     llvm_unreachable("Unsupported target");
   }
 
   // Before we switch over, we force a reference to a label inside the
-  // xray_instr_map section. Since this function is always called just
-  // before the function's end, we assume that this is happening after
-  // the last return instruction.
-
-  auto WordSizeBytes = TM.getPointerSize();
-  MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
+  // xray_fn_idx sections. This makes sure that the xray_fn_idx section is kept
+  // live by the linker if the function is not garbage-collected. Since this
+  // function is always called just before the function's end, we assume that
+  // this is happening after the last return instruction.
+  auto WordSizeBytes = MAI->getCodePointerSize();
+  MCSymbol *IdxRef = OutContext.createTempSymbol("xray_fn_idx_synth_", true);
   OutStreamer->EmitCodeAlignment(16);
-  OutStreamer->EmitSymbolValue(Tmp, WordSizeBytes, false);
-  OutStreamer->SwitchSection(Section);
-  OutStreamer->EmitLabel(Tmp);
+  OutStreamer->EmitSymbolValue(IdxRef, WordSizeBytes, false);
+
+  // Now we switch to the instrumentation map section. Because this is done
+  // per-function, we are able to create an index entry that will represent the
+  // range of sleds associated with a function.
+  MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true);
+  OutStreamer->SwitchSection(InstMap);
+  OutStreamer->EmitLabel(SledsStart);
   for (const auto &Sled : Sleds)
     Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym);
-
+  MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true);
+  OutStreamer->EmitLabel(SledsEnd);
+
+  // We then emit a single entry in the index per function. We use the symbols
+  // that bound the instrumentation map as the range for a specific function.
+  // Each entry here will be 2 * word size aligned, as we're writing down two
+  // pointers. This should work for both 32-bit and 64-bit platforms.
+  OutStreamer->SwitchSection(FnSledIndex);
+  OutStreamer->EmitCodeAlignment(2 * WordSizeBytes);
+  OutStreamer->EmitLabel(IdxRef);
+  OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes);
+  OutStreamer->EmitSymbolValue(SledsEnd, WordSizeBytes);
   OutStreamer->SwitchSection(PrevSection);
   Sleds.clear();
 }
@@ -2702,8 +2840,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
                             SledKind Kind) {
   auto Fn = MI.getParent()->getParent()->getFunction();
   auto Attr = Fn->getFnAttribute("function-instrument");
+  bool LogArgs = Fn->hasFnAttribute("xray-log-args");
   bool AlwaysInstrument =
     Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+  if (Kind == SledKind::FUNCTION_ENTER && LogArgs)
+    Kind = SledKind::LOG_ARGS_ENTER;
   Sleds.emplace_back(
     XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn });
 }
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 0185c38..0edf905 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -15,6 +15,7 @@
 #include "DwarfDebug.h"
 #include "DwarfExpression.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -26,7 +27,6 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 165b8ee..eae79ad 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -11,9 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -48,10 +48,16 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
       static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);
   assert(DiagInfo && "Diagnostic context not passed down?");
 
+  // Look up a LocInfo for the buffer this diagnostic is coming from.
+  unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc());
+  const MDNode *LocInfo = nullptr;
+  if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size())
+    LocInfo = DiagInfo->LocInfos[BufNum-1];
+
   // If the inline asm had metadata associated with it, pull out a location
   // cookie corresponding to which line the error occurred on.
   unsigned LocCookie = 0;
-  if (const MDNode *LocInfo = DiagInfo->LocInfo) {
+  if (LocInfo) {
     unsigned ErrorLine = Diag.getLineNo()-1;
     if (ErrorLine >= LocInfo->getNumOperands())
       ErrorLine = 0;
@@ -108,7 +114,6 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
 
   SourceMgr &SrcMgr = DiagInfo->SrcMgr;
   SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
-  DiagInfo->LocInfo = LocMDNode;
 
   std::unique_ptr<MemoryBuffer> Buffer;
   // The inline asm source manager will outlive Str, so make a copy of the
@@ -118,6 +123,12 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
   // Tell SrcMgr about this buffer, it takes ownership of the buffer.
   unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
 
+  // Store LocMDNode in DiagInfo, using BufNum as an identifier.
+  if (LocMDNode) {
+    DiagInfo->LocInfos.resize(BufNum);
+    DiagInfo->LocInfos[BufNum-1] = LocMDNode;
+  }
+
   std::unique_ptr<MCAsmParser> Parser(
       createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
 
@@ -133,6 +144,9 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
                        " we don't have an asm parser for this target\n");
   Parser->setAssemblerDialect(Dialect);
   Parser->setTargetParser(*TAP.get());
+  if (Dialect == InlineAsm::AD_Intel)
+    // We need this flag to be able to parse numbers like "0bH"
+    Parser->setParsingInlineAsm(true);
   if (MF) {
     const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
     TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
@@ -144,11 +158,6 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
                             /*NoFinalize*/ true);
   emitInlineAsmEnd(STI, &TAP->getSTI());
 
-  // LocInfo cannot be used for error generation from the backend.
-  // FIXME: associate LocInfo with the SourceBuffer to improve backend
-  // messages.
-  DiagInfo->LocInfo = nullptr;
-
   if (Res && !DiagInfo->DiagHandler)
     report_fatal_error("Error parsing inline asm\n");
 }
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 8344051..a81d56e 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp --*- C++ -*--===//
+//===- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp ----------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -12,37 +12,82 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeViewDebug.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
 #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 using namespace llvm::codeview;
-using namespace llvm::msf;
 
 CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
-    : DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(),
-      TypeTable(Allocator), CurFn(nullptr) {
+    : DebugHandlerBase(AP), OS(*Asm->OutStreamer), TypeTable(Allocator) {
   // If module doesn't have named metadata anchors or COFF debug section
   // is not available, skip any debug info related stuff.
   if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
@@ -178,7 +223,8 @@ static const DISubprogram *getQualifiedNameComponents(
 static std::string getQualifiedName(ArrayRef<StringRef> QualifiedNameComponents,
                                     StringRef TypeName) {
   std::string FullyQualifiedName;
-  for (StringRef QualifiedNameComponent : reverse(QualifiedNameComponents)) {
+  for (StringRef QualifiedNameComponent :
+       llvm::reverse(QualifiedNameComponents)) {
     FullyQualifiedName.append(QualifiedNameComponent);
     FullyQualifiedName.append("::");
   }
@@ -238,7 +284,7 @@ TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) {
 
   // The display name includes function template arguments. Drop them to match
   // MSVC.
-  StringRef DisplayName = SP->getDisplayName().split('<').first;
+  StringRef DisplayName = SP->getName().split('<').first;
 
   const DIScope *Scope = SP->getScope().resolve();
   TypeIndex TI;
@@ -319,7 +365,7 @@ static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs,
 void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
                                         const MachineFunction *MF) {
   // Skip this instruction if it has the same location as the previous one.
-  if (DL == CurFn->LastLoc)
+  if (!DL || DL == PrevInstLoc)
     return;
 
   const DIScope *Scope = DL.get()->getScope();
@@ -339,11 +385,11 @@ void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
   if (!CurFn->HaveLineInfo)
     CurFn->HaveLineInfo = true;
   unsigned FileId = 0;
-  if (CurFn->LastLoc.get() && CurFn->LastLoc->getFile() == DL->getFile())
+  if (PrevInstLoc.get() && PrevInstLoc->getFile() == DL->getFile())
     FileId = CurFn->LastFileId;
   else
     FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile());
-  CurFn->LastLoc = DL;
+  PrevInstLoc = DL;
 
   unsigned FuncId = CurFn->FuncId;
   if (const DILocation *SiteLoc = DL->getInlinedAt()) {
@@ -393,7 +439,7 @@ void CodeViewDebug::endModule() {
   // subprograms.
   switchToDebugSectionForSymbol(nullptr);
 
-  MCSymbol *CompilerInfo = beginCVSubsection(ModuleSubstreamKind::Symbols);
+  MCSymbol *CompilerInfo = beginCVSubsection(DebugSubsectionKind::Symbols);
   emitCompilerInformation();
   endCVSubsection(CompilerInfo);
 
@@ -417,7 +463,7 @@ void CodeViewDebug::endModule() {
 
   // Emit UDT records for any types used by global variables.
   if (!GlobalUDTs.empty()) {
-    MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+    MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
     emitDebugInfoForUDTs(GlobalUDTs);
     endCVSubsection(SymbolsEnd);
   }
@@ -469,17 +515,21 @@ void CodeViewDebug::emitTypeInformation() {
     CommentPrefix += ' ';
   }
 
-  TypeDatabase TypeDB;
-  CVTypeDumper CVTD(TypeDB);
-  TypeTable.ForEachRecord([&](TypeIndex Index, ArrayRef<uint8_t> Record) {
+  TypeTableCollection Table(TypeTable.records());
+  Optional<TypeIndex> B = Table.getFirst();
+  while (B) {
+    // This will fail if the record data is invalid.
+    CVType Record = Table.getType(*B);
+
     if (OS.isVerboseAsm()) {
       // Emit a block comment describing the type record for readability.
       SmallString<512> CommentBlock;
       raw_svector_ostream CommentOS(CommentBlock);
       ScopedPrinter SP(CommentOS);
       SP.setPrefix(CommentPrefix);
-      TypeDumpVisitor TDV(TypeDB, &SP, false);
-      Error E = CVTD.dump(Record, TDV);
+      TypeDumpVisitor TDV(Table, &SP, false);
+
+      Error E = codeview::visitTypeRecord(Record, *B, TDV);
       if (E) {
         logAllUnhandledErrors(std::move(E), errs(), "error: ");
         llvm_unreachable("produced malformed type record");
@@ -489,29 +539,10 @@ void CodeViewDebug::emitTypeInformation() {
       // newline.
       OS.emitRawComment(
           CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim());
-    } else {
-#ifndef NDEBUG
-      // Assert that the type data is valid even if we aren't dumping
-      // comments. The MSVC linker doesn't do much type record validation,
-      // so the first link of an invalid type record can succeed while
-      // subsequent links will fail with LNK1285.
-      ByteStream Stream(Record);
-      CVTypeArray Types;
-      StreamReader Reader(Stream);
-      Error E = Reader.readArray(Types, Reader.getLength());
-      if (!E) {
-        TypeVisitorCallbacks C;
-        E = CVTypeVisitor(C).visitTypeStream(Types);
-      }
-      if (E) {
-        logAllUnhandledErrors(std::move(E), errs(), "error: ");
-        llvm_unreachable("produced malformed type record");
-      }
-#endif
     }
-    StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
-    OS.EmitBinaryData(S);
-  });
+    OS.EmitBinaryData(Record.str_data());
+    B = Table.getNext(*B);
+  }
 }
 
 namespace {
@@ -586,7 +617,7 @@ static CPUType mapArchToCVCPUType(Triple::ArchType Type) {
   }
 }
 
-} // anonymous namespace
+} // end anonymous namespace
 
 void CodeViewDebug::emitCompilerInformation() {
   MCContext &Context = MMI->getContext();
@@ -645,7 +676,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
     return;
 
   OS.AddComment("Inlinee lines subsection");
-  MCSymbol *InlineEnd = beginCVSubsection(ModuleSubstreamKind::InlineeLines);
+  MCSymbol *InlineEnd = beginCVSubsection(DebugSubsectionKind::InlineeLines);
 
   // We don't provide any extra file info.
   // FIXME: Find out if debuggers use this info.
@@ -658,7 +689,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
     OS.AddBlankLine();
 
     unsigned FileId = maybeRecordFile(SP->getFile());
-    OS.AddComment("Inlined function " + SP->getDisplayName() + " starts at " +
+    OS.AddComment("Inlined function " + SP->getName() + " starts at " +
                   SP->getFilename() + Twine(':') + Twine(SP->getLine()));
     OS.AddBlankLine();
     // The filechecksum table uses 8 byte entries for now, and file ids start at
@@ -760,17 +791,17 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
 
   // If we have a display name, build the fully qualified name by walking the
   // chain of scopes.
-  if (!SP->getDisplayName().empty())
+  if (!SP->getName().empty())
     FuncName =
-        getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName());
+        getFullyQualifiedName(SP->getScope().resolve(), SP->getName());
 
   // If our DISubprogram name is empty, use the mangled name.
   if (FuncName.empty())
-    FuncName = GlobalValue::getRealLinkageName(GV->getName());
+    FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
 
   // Emit a symbol subsection, required by VS2012+ to find function boundaries.
OS.AddComment("Symbol subsection for " + Twine(FuncName)); - MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols); + MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols); { MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(), *ProcRecordEnd = MMI->getContext().createTempSymbol(); @@ -887,13 +918,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable( if (!Scope) continue; + // If the variable has an attached offset expression, extract it. + // FIXME: Try to handle DW_OP_deref as well. + int64_t ExprOffset = 0; + if (VI.Expr) + if (!VI.Expr->extractIfOffset(ExprOffset)) + continue; + // Get the frame register used and the offset. unsigned FrameReg = 0; int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg); uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg); // Calculate the label ranges. - LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset); + LocalVarDefRange DefRange = + createDefRangeMem(CVReg, FrameOffset + ExprOffset); for (const InsnRange &Range : Scope->getRanges()) { const MCSymbol *Begin = getLabelBeforeInsn(Range.first); const MCSymbol *End = getLabelAfterInsn(Range.second); @@ -948,10 +987,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { // Handle fragments. auto Fragment = DIExpr->getFragmentInfo(); - if (DIExpr && Fragment) { + if (Fragment) { IsSubfield = true; StructOffset = Fragment->OffsetInBits / 8; - } else if (DIExpr && DIExpr->getNumElements() > 0) { + } else if (DIExpr->getNumElements() > 0) { continue; // Ignore unrecognized exprs. } @@ -1014,14 +1053,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { } } -void CodeViewDebug::beginFunction(const MachineFunction *MF) { - assert(!CurFn && "Can't process two functions at once!"); - - if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) - return; - - DebugHandlerBase::beginFunction(MF); - +void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { const Function *GV = MF->getFunction(); assert(FnDebugInfo.count(GV) == false); CurFn = &FnDebugInfo[GV]; @@ -1038,11 +1070,11 @@ void CodeViewDebug::beginFunction(const MachineFunction *MF) { bool EmptyPrologue = true; for (const auto &MBB : *MF) { for (const auto &MI : MBB) { - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) { PrologEndLoc = MI.getDebugLoc(); break; - } else if (!MI.isDebugValue()) { + } else if (!MI.isMetaInstruction()) { EmptyPrologue = false; } } @@ -1144,33 +1176,12 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { DITypeRef ElementTypeRef = Ty->getBaseType(); TypeIndex ElementTypeIndex = getTypeIndex(ElementTypeRef); // IndexType is size_t, which depends on the bitness of the target. - TypeIndex IndexType = Asm->MAI->getPointerSize() == 8 + TypeIndex IndexType = Asm->TM.getPointerSize() == 8 ? TypeIndex(SimpleTypeKind::UInt64Quad) : TypeIndex(SimpleTypeKind::UInt32Long); uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8; - - // We want to assert that the element type multiplied by the array lengths - // match the size of the overall array. However, if we don't have complete - // type information for the base type, we can't make this assertion. 
This - // happens if limited debug info is enabled in this case: - // struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); }; - // VTableOptzn array[3]; - // The DICompositeType of VTableOptzn will have size zero, and the array will - // have size 3 * sizeof(void*), and we should avoid asserting. - // - // There is a related bug in the front-end where an array of a structure, - // which was declared as incomplete structure first, ends up not getting a - // size assigned to it. (PR28303) - // Example: - // struct A(*p)[3]; - // struct A { int f; } a[3]; - bool PartiallyIncomplete = false; - if (Ty->getSizeInBits() == 0 || ElementSize == 0) { - PartiallyIncomplete = true; - } - // Add subranges to array type. DINodeArray Elements = Ty->getElements(); for (int i = Elements.size() - 1; i >= 0; --i) { @@ -1185,16 +1196,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { // Variable Length Array (VLA) has Count equal to '-1'. // Replace with Count '1', assume it is the minimum VLA length. // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU. - if (Count == -1) { + if (Count == -1) Count = 1; - PartiallyIncomplete = true; - } // Update the element size and element type index for subsequent subranges. ElementSize *= Count; // If this is the outermost array, use the size from the array. It will be - // more accurate if PartiallyIncomplete is true. + // more accurate if we had a VLA or an incomplete element type size. uint64_t ArraySize = (i == 0 && ElementSize == 0) ? Ty->getSizeInBits() / 8 : ElementSize; @@ -1203,9 +1212,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { ElementTypeIndex = TypeTable.writeKnownType(AR); } - (void)PartiallyIncomplete; - assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8)); - return ElementTypeIndex; } @@ -1376,8 +1382,8 @@ TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty) { assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type); TypeIndex ClassTI = getTypeIndex(Ty->getClassType()); TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType()); - PointerKind PK = Asm->MAI->getPointerSize() == 8 ? PointerKind::Near64 - : PointerKind::Near32; + PointerKind PK = Asm->TM.getPointerSize() == 8 ? PointerKind::Near64 + : PointerKind::Near32; bool IsPMF = isa<DISubroutineType>(Ty->getBaseType()); PointerMode PM = IsPMF ? 
PointerMode::PointerToMemberFunction : PointerMode::PointerToDataMember; @@ -1492,7 +1498,8 @@ TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty, } TypeIndex CodeViewDebug::lowerTypeVFTableShape(const DIDerivedType *Ty) { - unsigned VSlotCount = Ty->getSizeInBits() / (8 * Asm->MAI->getPointerSize()); + unsigned VSlotCount = + Ty->getSizeInBits() / (8 * Asm->MAI->getCodePointerSize()); SmallVector<VFTableSlotKind, 4> Slots(VSlotCount, VFTableSlotKind::Near); VFTableShapeRecord VFTSR(Slots); @@ -1600,7 +1607,7 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) { EnumeratorCount++; } } - FTI = FLRB.end(); + FTI = FLRB.end(true); } std::string FullName = getFullyQualifiedName(Ty); @@ -1620,11 +1627,11 @@ struct llvm::ClassInfo { uint64_t BaseOffset; }; // [MemberInfo] - typedef std::vector<MemberInfo> MemberList; + using MemberList = std::vector<MemberInfo>; - typedef TinyPtrVector<const DISubprogram *> MethodsList; + using MethodsList = TinyPtrVector<const DISubprogram *>; // MethodName -> MethodsList - typedef MapVector<MDString *, MethodsList> MethodsMap; + using MethodsMap = MapVector<MDString *, MethodsList>; /// Base classes. std::vector<const DIDerivedType *> Inheritance; @@ -1736,10 +1743,12 @@ TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) { SizeInBytes, FullName, Ty->getIdentifier()); TypeIndex ClassTI = TypeTable.writeKnownType(CR); - StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(Ty->getFile())); - TypeIndex SIDI = TypeTable.writeKnownType(SIDR); - UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine()); - TypeTable.writeKnownType(USLR); + if (const auto *File = Ty->getFile()) { + StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(File)); + TypeIndex SIDI = TypeTable.writeKnownType(SIDR); + UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine()); + TypeTable.writeKnownType(USLR); + } addToUDTs(Ty, ClassTI); @@ -1887,7 +1896,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) { translateMethodOptionFlags(SP), VFTableOffset, Name)); MemberCount++; } - assert(Methods.size() > 0 && "Empty methods map entry"); + assert(!Methods.empty() && "Empty methods map entry"); if (Methods.size() == 1) FLBR.writeMemberType(Methods[0]); else { @@ -1905,7 +1914,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) { MemberCount++; } - TypeIndex FieldTI = FLBR.end(); + TypeIndex FieldTI = FLBR.end(true); return std::make_tuple(FieldTI, Info.VShapeTI, MemberCount, !Info.NestedClasses.empty()); } @@ -2115,18 +2124,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) { } } -void CodeViewDebug::endFunction(const MachineFunction *MF) { - if (!Asm || !CurFn) // We haven't created any debug info for this function. - return; - +void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) { const Function *GV = MF->getFunction(); assert(FnDebugInfo.count(GV)); assert(CurFn == &FnDebugInfo[GV]); collectVariableInfo(GV->getSubprogram()); - DebugHandlerBase::endFunction(MF); - // Don't emit anything if we don't have any line tables. if (!CurFn->HaveLineInfo) { FnDebugInfo.erase(GV); @@ -2146,13 +2150,27 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { if (!Asm || !CurFn || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) return; + + // If the first instruction of a new MBB has no location, find the first + // instruction with a location and use that. 
DebugLoc DL = MI->getDebugLoc(); - if (DL == PrevInstLoc || !DL) + if (!DL && MI->getParent() != PrevInstBB) { + for (const auto &NextMI : *MI->getParent()) { + DL = NextMI.getDebugLoc(); + if (DL) + break; + } + } + PrevInstBB = MI->getParent(); + + // If we still don't have a debug location, don't record a location. + if (!DL) return; + maybeRecordLocation(DL, Asm->MF); } -MCSymbol *CodeViewDebug::beginCVSubsection(ModuleSubstreamKind Kind) { +MCSymbol *CodeViewDebug::beginCVSubsection(DebugSubsectionKind Kind) { MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(), *EndLabel = MMI->getContext().createTempSymbol(); OS.EmitIntValue(unsigned(Kind), 4); @@ -2212,7 +2230,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() { if (!GV->hasComdat() && !GV->isDeclarationForLinker()) { if (!EndLabel) { OS.AddComment("Symbol subsection for globals"); - EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols); + EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols); } // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions. emitDebugInfoForGlobal(GVE->getVariable(), GV, Asm->getSymbol(GV)); @@ -2228,9 +2246,9 @@ void CodeViewDebug::emitDebugInfoForGlobals() { if (GV->hasComdat()) { MCSymbol *GVSym = Asm->getSymbol(GV); OS.AddComment("Symbol subsection for " + - Twine(GlobalValue::getRealLinkageName(GV->getName()))); + Twine(GlobalValue::dropLLVMManglingEscape(GV->getName()))); switchToDebugSectionForSymbol(GVSym); - EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols); + EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols); // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions. emitDebugInfoForGlobal(GVE->getVariable(), GV, GVSym); endCVSubsection(EndLabel); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 3dd4315..fd8f604 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -1,4 +1,4 @@ -//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h ----*- C++ -*--===// +//===- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h --------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,29 +14,44 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H #define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H +#include "DbgValueHistoryCalculator.h" #include "DebugHandlerBase.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" +#include <cstdint> +#include <map> +#include <string> +#include <tuple> +#include <unordered_map> +#include <utility> +#include <vector> namespace llvm { -class StringRef; -class LexicalScope; struct ClassInfo; +class StringRef; +class AsmPrinter; +class Function; +class GlobalVariable; +class MCSectionCOFF; +class MCStreamer; +class MCSymbol; +class MachineFunction; /// \brief Collects and handles line tables information in a 
CodeView format. class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { MCStreamer &OS; - llvm::BumpPtrAllocator Allocator; + BumpPtrAllocator Allocator; codeview::TypeTableBuilder TypeTable; /// Represents the most general definition range. @@ -103,14 +118,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { SmallVector<LocalVariable, 1> Locals; - DebugLoc LastLoc; const MCSymbol *Begin = nullptr; const MCSymbol *End = nullptr; unsigned FuncId = 0; unsigned LastFileId = 0; bool HaveLineInfo = false; }; - FunctionInfo *CurFn; + FunctionInfo *CurFn = nullptr; /// The set of comdat .debug$S sections that we've seen so far. Each section /// must start with a magic version number that must only be emitted once. @@ -176,8 +190,9 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { std::vector<std::pair<std::string, codeview::TypeIndex>> LocalUDTs, GlobalUDTs; - typedef std::map<const DIFile *, std::string> FileToFilepathMapTy; + using FileToFilepathMapTy = std::map<const DIFile *, std::string>; FileToFilepathMapTy FileToFilepathMap; + StringRef getFullFilepath(const DIFile *S); unsigned maybeRecordFile(const DIFile *F); @@ -216,14 +231,14 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { /// Opens a subsection of the given kind in a .debug$S codeview section. /// Returns an end label for use with endCVSubsection when the subsection is /// finished. - MCSymbol *beginCVSubsection(codeview::ModuleSubstreamKind Kind); + MCSymbol *beginCVSubsection(codeview::DebugSubsectionKind Kind); void endCVSubsection(MCSymbol *EndLabel); void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt, const InlineSite &Site); - typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; + using InlinedVariable = DbgValueHistoryMap::InlinedVariable; void collectVariableInfo(const DISubprogram *SP); @@ -299,23 +314,25 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { unsigned getPointerSizeInBytes(); +protected: + /// \brief Gather pre-function debug information. + void beginFunctionImpl(const MachineFunction *MF) override; + + /// \brief Gather post-function debug information. + void endFunctionImpl(const MachineFunction *) override; + public: CodeViewDebug(AsmPrinter *Asm); - void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {} + void setSymbolSize(const MCSymbol *, uint64_t) override {} /// \brief Emit the COFF section that holds the line table information. void endModule() override; - /// \brief Gather pre-function debug information. - void beginFunction(const MachineFunction *MF) override; - - /// \brief Gather post-function debug information. - void endFunction(const MachineFunction *) override; - /// \brief Process beginning of an instruction. 
void beginInstruction(const MachineInstr *MI) override; }; -} // End of namespace llvm -#endif +} // end namespace llvm + +#endif // LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 8799189..886e6e2 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -31,6 +31,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "dwarfdebug" + //===----------------------------------------------------------------------===// // DIEAbbrevData Implementation //===----------------------------------------------------------------------===// @@ -42,6 +44,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const { // overloads. Otherwise MSVC 2010 thinks this call is ambiguous. ID.AddInteger(unsigned(Attribute)); ID.AddInteger(unsigned(Form)); + if (Form == dwarf::DW_FORM_implicit_const) + ID.AddInteger(Value); } //===----------------------------------------------------------------------===// @@ -77,15 +81,22 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { dwarf::AttributeString(AttrData.getAttribute()).data()); // Emit form type. +#ifndef NDEBUG + // Could be an assertion, but this way we can see the failing form code + // easily, which helps track down where it came from. + if (!dwarf::isValidFormForVersion(AttrData.getForm(), + AP->getDwarfVersion())) { + DEBUG(dbgs() << "Invalid form " << format("0x%x", AttrData.getForm()) + << " for DWARF version " << AP->getDwarfVersion() << "\n"); + llvm_unreachable("Invalid form for specified DWARF version"); + } +#endif AP->EmitULEB128(AttrData.getForm(), dwarf::FormEncodingString(AttrData.getForm()).data()); // Emit value for DW_FORM_implicit_const. - if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) { - assert(AP->getDwarfVersion() >= 5 && - "DW_FORM_implicit_const is supported starting from DWARFv5"); + if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) AP->EmitSLEB128(AttrData.getValue()); - } } // Mark end of abbreviation. 
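The DIEAbbrev::Emit hunk above now checks each form against the DWARF version being emitted (via isValidFormForVersion) and emits the DW_FORM_implicit_const value unconditionally, so the separate "DWARFv5 only" assertion for that form becomes unnecessary. The attribute and form codes themselves go out through EmitULEB128, and the implicit-const value through EmitSLEB128. As a reminder of what those calls produce on the wire, here is a minimal standalone sketch of the two LEB128 encodings; the function names are invented for illustration and are not the LLVM streamer API.

#include <cstdint>
#include <vector>

// Unsigned LEB128: 7 payload bits per byte, high bit set while more follow.
void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // continuation bit
    Out.push_back(Byte);
  } while (Value != 0);
}

// Signed LEB128: same framing, but the input is treated as sign-extended, so
// emission stops once the remaining bits are all copies of the sign bit.
void encodeSLEB128(int64_t Value, std::vector<uint8_t> &Out) {
  bool More;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7; // arithmetic shift keeps the sign
    More = !((Value == 0 && (Byte & 0x40) == 0) ||
             (Value == -1 && (Byte & 0x40) != 0));
    if (More)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (More);
}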
@@ -94,7 +105,7 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { } LLVM_DUMP_METHOD -void DIEAbbrev::print(raw_ostream &O) { +void DIEAbbrev::print(raw_ostream &O) const { O << "Abbreviation @" << format("0x%lx", (long)(intptr_t)this) << " " @@ -107,13 +118,20 @@ void DIEAbbrev::print(raw_ostream &O) { O << " " << dwarf::AttributeString(Data[i].getAttribute()) << " " - << dwarf::FormEncodingString(Data[i].getForm()) - << '\n'; + << dwarf::FormEncodingString(Data[i].getForm()); + + if (Data[i].getForm() == dwarf::DW_FORM_implicit_const) + O << " " << Data[i].getValue(); + + O << '\n'; } } -LLVM_DUMP_METHOD -void DIEAbbrev::dump() { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEAbbrev::dump() const { + print(dbgs()); +} +#endif //===----------------------------------------------------------------------===// // DIEAbbrevSet Implementation @@ -249,10 +267,11 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const { O << "\n"; } -LLVM_DUMP_METHOD -void DIE::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIE::dump() const { print(dbgs()); } +#endif unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP, DIEAbbrevSet &AbbrevSet, @@ -340,10 +359,11 @@ void DIEValue::print(raw_ostream &O) const { } } -LLVM_DUMP_METHOD -void DIEValue::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEValue::dump() const { print(dbgs()); } +#endif //===----------------------------------------------------------------------===// // DIEInteger Implementation @@ -354,57 +374,42 @@ void DIEValue::dump() const { void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_implicit_const: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: // Emit something to keep the lines and comments in sync. // FIXME: Is there a better way to do this? 
Asm->OutStreamer->AddBlankLine(); return; case dwarf::DW_FORM_flag: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref1: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data1: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_strx1: + case dwarf::DW_FORM_addrx1: case dwarf::DW_FORM_ref2: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data2: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_strx2: + case dwarf::DW_FORM_addrx2: case dwarf::DW_FORM_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref4: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data4: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_ref_sup4: + case dwarf::DW_FORM_strx4: + case dwarf::DW_FORM_addrx4: case dwarf::DW_FORM_ref8: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_sig8: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data8: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_ref_sup8: case dwarf::DW_FORM_GNU_ref_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_strp_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_line_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_sec_offset: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp_sup: - LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sup: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_addr: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_addr: Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form)); return; case dwarf::DW_FORM_GNU_str_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_addr_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_udata: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; @@ -419,35 +424,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { /// unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { - case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_flag_present: return 0; - case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data1: return sizeof(int8_t); - case dwarf::DW_FORM_ref2: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data2: return sizeof(int16_t); - case dwarf::DW_FORM_ref4: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data4: return sizeof(int32_t); - case dwarf::DW_FORM_ref8: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sig8: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data8: return sizeof(int64_t); + case dwarf::DW_FORM_implicit_const: + case dwarf::DW_FORM_flag_present: + return 0; + case dwarf::DW_FORM_flag: + case dwarf::DW_FORM_ref1: + case dwarf::DW_FORM_data1: + case dwarf::DW_FORM_strx1: + case dwarf::DW_FORM_addrx1: + return sizeof(int8_t); + case dwarf::DW_FORM_ref2: + case dwarf::DW_FORM_data2: + case dwarf::DW_FORM_strx2: + case dwarf::DW_FORM_addrx2: + return sizeof(int16_t); + case dwarf::DW_FORM_ref4: + case dwarf::DW_FORM_data4: + case dwarf::DW_FORM_ref_sup4: + case dwarf::DW_FORM_strx4: + case dwarf::DW_FORM_addrx4: + return sizeof(int32_t); + case dwarf::DW_FORM_ref8: + case dwarf::DW_FORM_ref_sig8: + case dwarf::DW_FORM_data8: + case dwarf::DW_FORM_ref_sup8: + return sizeof(int64_t); case dwarf::DW_FORM_ref_addr: if (AP->getDwarfVersion() == 2) return AP->getPointerSize(); LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_ref_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_strp_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_line_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_sec_offset: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp_sup: - LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sup: switch (AP->OutStreamer->getContext().getDwarfFormat()) { case dwarf::DWARF32: return 4; @@ -456,11 +467,8 @@ unsigned 
DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } llvm_unreachable("Invalid DWARF format"); case dwarf::DW_FORM_GNU_str_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_addr_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_udata: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_udata: return getULEB128Size(Integer); case dwarf::DW_FORM_sdata: @@ -484,7 +492,7 @@ void DIEInteger::print(raw_ostream &O) const { /// EmitValue - Emit expression value. /// void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->EmitDebugValue(Expr, SizeOf(AP, Form)); + AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form)); } /// SizeOf - Determine size of expression value in bytes. @@ -519,7 +527,7 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); } LLVM_DUMP_METHOD @@ -541,7 +549,7 @@ unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); } LLVM_DUMP_METHOD @@ -647,20 +655,12 @@ void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_ref_addr: { // Get the absolute offset for this DIE within the debug info/types section. unsigned Addr = Entry->getDebugSectionOffset(); - if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) { - const DwarfDebug *DD = AP->getDwarfDebug(); - if (DD) - assert(!DD->useSplitDwarf() && - "TODO: dwo files can't have relocations."); - const DIEUnit *Unit = Entry->getUnit(); - assert(Unit && "CUDie should belong to a CU."); - MCSection *Section = Unit->getSection(); - if (Section) { - const MCSymbol *SectionSym = Section->getBeginSymbol(); - AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); - return; - } + if (const MCSymbol *SectionSym = + Entry->getUnit()->getCrossSectionRelativeBaseAddress()) { + AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); + return; } + AP->OutStreamer->EmitIntValue(Addr, SizeOf(AP, Form)); return; } @@ -683,7 +683,7 @@ unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return getULEB128Size(Entry->getOffset()); case dwarf::DW_FORM_ref_addr: if (AP->getDwarfVersion() == 2) - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); switch (AP->OutStreamer->getContext().getDwarfFormat()) { case dwarf::DWARF32: return 4; @@ -809,7 +809,7 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); } /// EmitValue - Emit label value. 
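The DIEHash changes that follow replace two hand-maintained, roughly fifty-entry attribute lists (the collectAttributes switch and the hashAttributes sequence) plus the matching field declarations in DIEHash.h with a single DIEHashAttributes.def file expanded through a HANDLE_DIE_HASH_ATTR macro, so a new attribute only has to be added in one place. The sketch below shows the general X-macro pattern with an invented color list, not the DIE-hash code itself: one list macro drives both a declaration and a switch.

#include <cstdio>

// One list macro stands in for the .def file so the sketch stays in a single
// translation unit; in a real tree each HANDLE_COLOR(...) line would live in
// its own .def file and be pulled in with #include wherever the list is used.
#define FOR_EACH_COLOR(HANDLE) \
  HANDLE(Red)                  \
  HANDLE(Green)                \
  HANDLE(Blue)

// Expansion 1: declare one enumerator per entry.
enum class Color {
#define HANDLE_COLOR(NAME) NAME,
  FOR_EACH_COLOR(HANDLE_COLOR)
#undef HANDLE_COLOR
};

// Expansion 2: generate one switch case per entry.
const char *toString(Color C) {
  switch (C) {
#define HANDLE_COLOR(NAME) \
  case Color::NAME:        \
    return #NAME;
    FOR_EACH_COLOR(HANDLE_COLOR)
#undef HANDLE_COLOR
  }
  return "unknown";
}

int main() {
  std::printf("%s\n", toString(Color::Green)); // prints "Green"
  return 0;
}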
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp index d8ecc7c..15ade3c 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -11,15 +11,15 @@ // //===----------------------------------------------------------------------===// -#include "ByteStreamer.h" #include "DIEHash.h" +#include "ByteStreamer.h" #include "DwarfDebug.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DIE.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" @@ -116,65 +116,17 @@ void DIEHash::addParentContext(const DIE &Parent) { // Collect all of the attributes for a particular DIE in single structure. void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) { -#define COLLECT_ATTR(NAME) \ - case dwarf::NAME: \ - Attrs.NAME = V; \ - break for (const auto &V : Die.values()) { DEBUG(dbgs() << "Attribute: " << dwarf::AttributeString(V.getAttribute()) << " added.\n"); switch (V.getAttribute()) { - COLLECT_ATTR(DW_AT_name); - COLLECT_ATTR(DW_AT_accessibility); - COLLECT_ATTR(DW_AT_address_class); - COLLECT_ATTR(DW_AT_allocated); - COLLECT_ATTR(DW_AT_artificial); - COLLECT_ATTR(DW_AT_associated); - COLLECT_ATTR(DW_AT_binary_scale); - COLLECT_ATTR(DW_AT_bit_offset); - COLLECT_ATTR(DW_AT_bit_size); - COLLECT_ATTR(DW_AT_bit_stride); - COLLECT_ATTR(DW_AT_byte_size); - COLLECT_ATTR(DW_AT_byte_stride); - COLLECT_ATTR(DW_AT_const_expr); - COLLECT_ATTR(DW_AT_const_value); - COLLECT_ATTR(DW_AT_containing_type); - COLLECT_ATTR(DW_AT_count); - COLLECT_ATTR(DW_AT_data_bit_offset); - COLLECT_ATTR(DW_AT_data_location); - COLLECT_ATTR(DW_AT_data_member_location); - COLLECT_ATTR(DW_AT_decimal_scale); - COLLECT_ATTR(DW_AT_decimal_sign); - COLLECT_ATTR(DW_AT_default_value); - COLLECT_ATTR(DW_AT_digit_count); - COLLECT_ATTR(DW_AT_discr); - COLLECT_ATTR(DW_AT_discr_list); - COLLECT_ATTR(DW_AT_discr_value); - COLLECT_ATTR(DW_AT_encoding); - COLLECT_ATTR(DW_AT_enum_class); - COLLECT_ATTR(DW_AT_endianity); - COLLECT_ATTR(DW_AT_explicit); - COLLECT_ATTR(DW_AT_is_optional); - COLLECT_ATTR(DW_AT_location); - COLLECT_ATTR(DW_AT_lower_bound); - COLLECT_ATTR(DW_AT_mutable); - COLLECT_ATTR(DW_AT_ordering); - COLLECT_ATTR(DW_AT_picture_string); - COLLECT_ATTR(DW_AT_prototyped); - COLLECT_ATTR(DW_AT_small); - COLLECT_ATTR(DW_AT_segment); - COLLECT_ATTR(DW_AT_string_length); - COLLECT_ATTR(DW_AT_threads_scaled); - COLLECT_ATTR(DW_AT_upper_bound); - COLLECT_ATTR(DW_AT_use_location); - COLLECT_ATTR(DW_AT_use_UTF8); - COLLECT_ATTR(DW_AT_variable_parameter); - COLLECT_ATTR(DW_AT_virtuality); - COLLECT_ATTR(DW_AT_visibility); - COLLECT_ATTR(DW_AT_vtable_elem_location); - COLLECT_ATTR(DW_AT_type); +#define HANDLE_DIE_HASH_ATTR(NAME) \ + case dwarf::NAME: \ + Attrs.NAME = V; \ + break; +#include "DIEHashAttributes.def" default: break; } @@ -366,62 +318,12 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) { // Go through the attributes from \param Attrs in the order specified in 7.27.4 // and hash them. 
void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) { -#define ADD_ATTR(ATTR) \ +#define HANDLE_DIE_HASH_ATTR(NAME) \ { \ - if (ATTR) \ - hashAttribute(ATTR, Tag); \ + if (Attrs.NAME) \ + hashAttribute(Attrs.NAME, Tag); \ } - - ADD_ATTR(Attrs.DW_AT_name); - ADD_ATTR(Attrs.DW_AT_accessibility); - ADD_ATTR(Attrs.DW_AT_address_class); - ADD_ATTR(Attrs.DW_AT_allocated); - ADD_ATTR(Attrs.DW_AT_artificial); - ADD_ATTR(Attrs.DW_AT_associated); - ADD_ATTR(Attrs.DW_AT_binary_scale); - ADD_ATTR(Attrs.DW_AT_bit_offset); - ADD_ATTR(Attrs.DW_AT_bit_size); - ADD_ATTR(Attrs.DW_AT_bit_stride); - ADD_ATTR(Attrs.DW_AT_byte_size); - ADD_ATTR(Attrs.DW_AT_byte_stride); - ADD_ATTR(Attrs.DW_AT_const_expr); - ADD_ATTR(Attrs.DW_AT_const_value); - ADD_ATTR(Attrs.DW_AT_containing_type); - ADD_ATTR(Attrs.DW_AT_count); - ADD_ATTR(Attrs.DW_AT_data_bit_offset); - ADD_ATTR(Attrs.DW_AT_data_location); - ADD_ATTR(Attrs.DW_AT_data_member_location); - ADD_ATTR(Attrs.DW_AT_decimal_scale); - ADD_ATTR(Attrs.DW_AT_decimal_sign); - ADD_ATTR(Attrs.DW_AT_default_value); - ADD_ATTR(Attrs.DW_AT_digit_count); - ADD_ATTR(Attrs.DW_AT_discr); - ADD_ATTR(Attrs.DW_AT_discr_list); - ADD_ATTR(Attrs.DW_AT_discr_value); - ADD_ATTR(Attrs.DW_AT_encoding); - ADD_ATTR(Attrs.DW_AT_enum_class); - ADD_ATTR(Attrs.DW_AT_endianity); - ADD_ATTR(Attrs.DW_AT_explicit); - ADD_ATTR(Attrs.DW_AT_is_optional); - ADD_ATTR(Attrs.DW_AT_location); - ADD_ATTR(Attrs.DW_AT_lower_bound); - ADD_ATTR(Attrs.DW_AT_mutable); - ADD_ATTR(Attrs.DW_AT_ordering); - ADD_ATTR(Attrs.DW_AT_picture_string); - ADD_ATTR(Attrs.DW_AT_prototyped); - ADD_ATTR(Attrs.DW_AT_small); - ADD_ATTR(Attrs.DW_AT_segment); - ADD_ATTR(Attrs.DW_AT_string_length); - ADD_ATTR(Attrs.DW_AT_threads_scaled); - ADD_ATTR(Attrs.DW_AT_upper_bound); - ADD_ATTR(Attrs.DW_AT_use_location); - ADD_ATTR(Attrs.DW_AT_use_UTF8); - ADD_ATTR(Attrs.DW_AT_variable_parameter); - ADD_ATTR(Attrs.DW_AT_virtuality); - ADD_ATTR(Attrs.DW_AT_visibility); - ADD_ATTR(Attrs.DW_AT_vtable_elem_location); - ADD_ATTR(Attrs.DW_AT_type); - +#include "DIEHashAttributes.def" // FIXME: Add the extended attributes. } @@ -478,10 +380,12 @@ void DIEHash::computeHash(const DIE &Die) { /// DWARF4 standard. It is an md5 hash of the flattened description of the DIE /// with the inclusion of the full CU and all top level CU entities. // TODO: Initialize the type chain at 0 instead of 1 for CU signatures. -uint64_t DIEHash::computeCUSignature(const DIE &Die) { +uint64_t DIEHash::computeCUSignature(StringRef DWOName, const DIE &Die) { Numbering.clear(); Numbering[&Die] = 1; + if (!DWOName.empty()) + Hash.update(DWOName); // Hash the DIE. computeHash(Die); @@ -490,9 +394,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) { Hash.final(Result); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - return support::endian::read64le(Result + 8); + // implementation always returns its results in little endian, so we actually + // need the "high" word. + return Result.high(); } /// This is based on the type signature computation given in section 7.27 of the @@ -514,7 +418,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) { Hash.final(Result); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. 
- return support::endian::read64le(Result + 8); + // implementation always returns its results in little endian, so we actually + // need the "high" word. + return Result.high(); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h index 996cd7e..29337ae 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h @@ -28,64 +28,15 @@ class CompileUnit; class DIEHash { // Collection of all attributes used in hashing a particular DIE. struct DIEAttrs { - DIEValue DW_AT_name; - DIEValue DW_AT_accessibility; - DIEValue DW_AT_address_class; - DIEValue DW_AT_allocated; - DIEValue DW_AT_artificial; - DIEValue DW_AT_associated; - DIEValue DW_AT_binary_scale; - DIEValue DW_AT_bit_offset; - DIEValue DW_AT_bit_size; - DIEValue DW_AT_bit_stride; - DIEValue DW_AT_byte_size; - DIEValue DW_AT_byte_stride; - DIEValue DW_AT_const_expr; - DIEValue DW_AT_const_value; - DIEValue DW_AT_containing_type; - DIEValue DW_AT_count; - DIEValue DW_AT_data_bit_offset; - DIEValue DW_AT_data_location; - DIEValue DW_AT_data_member_location; - DIEValue DW_AT_decimal_scale; - DIEValue DW_AT_decimal_sign; - DIEValue DW_AT_default_value; - DIEValue DW_AT_digit_count; - DIEValue DW_AT_discr; - DIEValue DW_AT_discr_list; - DIEValue DW_AT_discr_value; - DIEValue DW_AT_encoding; - DIEValue DW_AT_enum_class; - DIEValue DW_AT_endianity; - DIEValue DW_AT_explicit; - DIEValue DW_AT_is_optional; - DIEValue DW_AT_location; - DIEValue DW_AT_lower_bound; - DIEValue DW_AT_mutable; - DIEValue DW_AT_ordering; - DIEValue DW_AT_picture_string; - DIEValue DW_AT_prototyped; - DIEValue DW_AT_small; - DIEValue DW_AT_segment; - DIEValue DW_AT_string_length; - DIEValue DW_AT_threads_scaled; - DIEValue DW_AT_upper_bound; - DIEValue DW_AT_use_location; - DIEValue DW_AT_use_UTF8; - DIEValue DW_AT_variable_parameter; - DIEValue DW_AT_virtuality; - DIEValue DW_AT_visibility; - DIEValue DW_AT_vtable_elem_location; - DIEValue DW_AT_type; - - // Insert any additional ones here... +#define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME; +#include "DIEHashAttributes.def" }; public: DIEHash(AsmPrinter *A = nullptr) : AP(A) {} /// \brief Computes the CU signature. - uint64_t computeCUSignature(const DIE &Die); + uint64_t computeCUSignature(StringRef DWOName, const DIE &Die); /// \brief Computes the type signature. 
uint64_t computeTypeSignature(const DIE &Die); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def new file mode 100644 index 0000000..28a0239 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def @@ -0,0 +1,55 @@ +#ifndef HANDLE_DIE_HASH_ATTR +#error "Missing macro definition of HANDLE_DIE_HASH_ATTR" +#endif + +HANDLE_DIE_HASH_ATTR(DW_AT_name) +HANDLE_DIE_HASH_ATTR(DW_AT_accessibility) +HANDLE_DIE_HASH_ATTR(DW_AT_address_class) +HANDLE_DIE_HASH_ATTR(DW_AT_allocated) +HANDLE_DIE_HASH_ATTR(DW_AT_artificial) +HANDLE_DIE_HASH_ATTR(DW_AT_associated) +HANDLE_DIE_HASH_ATTR(DW_AT_binary_scale) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_offset) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_size) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_stride) +HANDLE_DIE_HASH_ATTR(DW_AT_byte_size) +HANDLE_DIE_HASH_ATTR(DW_AT_byte_stride) +HANDLE_DIE_HASH_ATTR(DW_AT_const_expr) +HANDLE_DIE_HASH_ATTR(DW_AT_const_value) +HANDLE_DIE_HASH_ATTR(DW_AT_containing_type) +HANDLE_DIE_HASH_ATTR(DW_AT_count) +HANDLE_DIE_HASH_ATTR(DW_AT_data_bit_offset) +HANDLE_DIE_HASH_ATTR(DW_AT_data_location) +HANDLE_DIE_HASH_ATTR(DW_AT_data_member_location) +HANDLE_DIE_HASH_ATTR(DW_AT_decimal_scale) +HANDLE_DIE_HASH_ATTR(DW_AT_decimal_sign) +HANDLE_DIE_HASH_ATTR(DW_AT_default_value) +HANDLE_DIE_HASH_ATTR(DW_AT_digit_count) +HANDLE_DIE_HASH_ATTR(DW_AT_discr) +HANDLE_DIE_HASH_ATTR(DW_AT_discr_list) +HANDLE_DIE_HASH_ATTR(DW_AT_discr_value) +HANDLE_DIE_HASH_ATTR(DW_AT_encoding) +HANDLE_DIE_HASH_ATTR(DW_AT_enum_class) +HANDLE_DIE_HASH_ATTR(DW_AT_endianity) +HANDLE_DIE_HASH_ATTR(DW_AT_explicit) +HANDLE_DIE_HASH_ATTR(DW_AT_is_optional) +HANDLE_DIE_HASH_ATTR(DW_AT_location) +HANDLE_DIE_HASH_ATTR(DW_AT_lower_bound) +HANDLE_DIE_HASH_ATTR(DW_AT_mutable) +HANDLE_DIE_HASH_ATTR(DW_AT_ordering) +HANDLE_DIE_HASH_ATTR(DW_AT_picture_string) +HANDLE_DIE_HASH_ATTR(DW_AT_prototyped) +HANDLE_DIE_HASH_ATTR(DW_AT_small) +HANDLE_DIE_HASH_ATTR(DW_AT_segment) +HANDLE_DIE_HASH_ATTR(DW_AT_string_length) +HANDLE_DIE_HASH_ATTR(DW_AT_threads_scaled) +HANDLE_DIE_HASH_ATTR(DW_AT_upper_bound) +HANDLE_DIE_HASH_ATTR(DW_AT_use_location) +HANDLE_DIE_HASH_ATTR(DW_AT_use_UTF8) +HANDLE_DIE_HASH_ATTR(DW_AT_variable_parameter) +HANDLE_DIE_HASH_ATTR(DW_AT_virtuality) +HANDLE_DIE_HASH_ATTR(DW_AT_visibility) +HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location) +HANDLE_DIE_HASH_ATTR(DW_AT_type) + +#undef HANDLE_DIE_HASH_ATTR diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp index 22fd7bb..c2ad9db 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp @@ -194,6 +194,10 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF, // some variables. for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef() && MO.getReg()) { + // Ignore call instructions that claim to clobber SP. The AArch64 + // backend does this for aggregate function arguments. + if (MI.isCall() && MO.getReg() == SP) + continue; // If this is a virtual register, only clobber it since it doesn't // have aliases. if (TRI->isVirtualRegister(MO.getReg())) @@ -209,8 +213,7 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF, } else if (MO.isRegMask()) { // If this is a register mask operand, clobber all debug values in // non-CSRs. 
- for (int I = ChangingRegs.find_first(); I != -1; - I = ChangingRegs.find_next(I)) { + for (unsigned I : ChangingRegs.set_bits()) { // Don't consider SP to be clobbered by register masks. if (unsigned(I) != SP && TRI->isPhysicalRegister(I) && MO.clobbersPhysReg(I)) { diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 9419098..0971c59 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { return getBaseTypeSize(BaseType); } +static bool hasDebugInfo(const MachineModuleInfo *MMI, + const MachineFunction *MF) { + if (!MMI->hasDebugInfo()) + return false; + auto *SP = MF->getFunction()->getSubprogram(); + if (!SP) + return false; + assert(SP->getUnit()); + auto EK = SP->getUnit()->getEmissionKind(); + if (EK == DICompileUnit::NoDebug) + return false; + return true; +} + void DebugHandlerBase::beginFunction(const MachineFunction *MF) { + PrevInstBB = nullptr; + + if (!Asm || !hasDebugInfo(MMI, MF)) { + skippedNonDebugFunction(); + return; + } + // Grab the lexical scopes for the function, if we don't have any of those // then we're not going to be able to do anything. LScopes.initialize(*MF); - if (LScopes.empty()) + if (LScopes.empty()) { + beginFunctionImpl(MF); return; + } // Make sure that each lexical scope will have a begin/end label. identifyScopeMarkers(); @@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) { PrevInstLoc = DebugLoc(); PrevLabel = Asm->getFunctionBegin(); + beginFunctionImpl(MF); } void DebugHandlerBase::beginInstruction(const MachineInstr *MI) { @@ -200,9 +224,9 @@ void DebugHandlerBase::endInstruction() { return; assert(CurMI != nullptr); - // Don't create a new label after DBG_VALUE instructions. - // They don't generate code. - if (!CurMI->isDebugValue()) { + // Don't create a new label after DBG_VALUE and other instructions that don't + // generate code. + if (!CurMI->isMetaInstruction()) { PrevLabel = nullptr; PrevInstBB = CurMI->getParent(); } @@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() { } void DebugHandlerBase::endFunction(const MachineFunction *MF) { + if (hasDebugInfo(MMI, MF)) + endFunctionImpl(MF); DbgValues.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h index c00fa18..659a921 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h @@ -80,6 +80,10 @@ protected: LabelsAfterInsn.insert(std::make_pair(MI, nullptr)); } + virtual void beginFunctionImpl(const MachineFunction *MF) = 0; + virtual void endFunctionImpl(const MachineFunction *MF) = 0; + virtual void skippedNonDebugFunction() {} + // AsmPrinterHandler overrides. 
public: void beginInstruction(const MachineInstr *MI) override; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h index 36fb150..a68e8cc 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -76,7 +76,8 @@ public: const DIExpression *getExpression() const { return Expression; } friend bool operator==(const Value &, const Value &); friend bool operator<(const Value &, const Value &); - void dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { if (isLocation()) { llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " "; if (Loc.isIndirect()) @@ -90,6 +91,7 @@ public: if (Expression) Expression->dump(); } +#endif }; private: diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h index 3656e9d..0c551df 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h @@ -10,9 +10,9 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H +#include "ByteStreamer.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "ByteStreamer.h" namespace llvm { diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h index 05ac1cb..b1ef8cf 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h @@ -16,12 +16,12 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/DIE.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index e08306b..dd7f793 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -14,6 +14,7 @@ #include "DwarfException.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -28,7 +29,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MachineLocation.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetFrameLowering.h" diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d904372..676c48f 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1,3 +1,16 @@ +//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains support for constructing a dwarf compile unit. +// +//===----------------------------------------------------------------------===// + #include "DwarfCompileUnit.h" #include "DwarfExpression.h" #include "llvm/CodeGen/MachineFunction.h" @@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( bool addToAccelTable = false; DIELoc *Loc = nullptr; std::unique_ptr<DIEDwarfExpression> DwarfExpr; - bool AllConstant = std::all_of( - GlobalExprs.begin(), GlobalExprs.end(), - [&](const GlobalExpr GE) { - return GE.Expr && GE.Expr->isConstant(); - }); - for (const auto &GE : GlobalExprs) { const GlobalVariable *Global = GE.Var; const DIExpression *Expr = GE.Expr; + // For compatibility with DWARF 3 and earlier, // DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes // DW_AT_const_value(X). if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) { + addToAccelTable = true; addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1)); - // We cannot describe the location of dllimport'd variables: the - // computation of their address requires loads from the IAT. - } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) { - if (!Loc) { - Loc = new (DIEValueAllocator) DIELoc; - DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); - } + break; + } + + // We cannot describe the location of dllimport'd variables: the + // computation of their address requires loads from the IAT. + if (Global && Global->hasDLLImportStorageClass()) + continue; + + // Nothing to describe without address or constant. + if (!Global && (!Expr || !Expr->isConstant())) + continue; + + if (!Loc) { addToAccelTable = true; - if (Global) { - const MCSymbol *Sym = Asm->getSymbol(Global); - if (Global->isThreadLocal()) { - if (Asm->TM.Options.EmulatedTLS) { - // TODO: add debug info for emulated thread local mode. - } else { - // FIXME: Make this work with -gsplit-dwarf. - unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support for other sizes if necessary"); - // Based on GCC's support for TLS: - if (!DD->useSplitDwarf()) { - // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u - : dwarf::DW_OP_const8u); - // 2) containing the (relocated) offset of the TLS variable - // within the module's TLS block. - addExpr(*Loc, dwarf::DW_FORM_udata, - Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); - } else { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); - addUInt(*Loc, dwarf::DW_FORM_udata, - DD->getAddressPool().getIndex(Sym, /* TLS */ true)); - } - // 3) followed by an OP to make the debugger do a TLS lookup. + Loc = new (DIEValueAllocator) DIELoc; + DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); + } + + if (Global) { + const MCSymbol *Sym = Asm->getSymbol(Global); + if (Global->isThreadLocal()) { + if (Asm->TM.Options.EmulatedTLS) { + // TODO: add debug info for emulated thread local mode. + } else { + // FIXME: Make this work with -gsplit-dwarf. 
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + // Based on GCC's support for TLS: + if (!DD->useSplitDwarf()) { + // 1) Start with a constNu of the appropriate pointer size addUInt(*Loc, dwarf::DW_FORM_data1, - DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address - : dwarf::DW_OP_form_tls_address); + PointerSize == 4 ? dwarf::DW_OP_const4u + : dwarf::DW_OP_const8u); + // 2) containing the (relocated) offset of the TLS variable + // within the module's TLS block. + addExpr(*Loc, dwarf::DW_FORM_udata, + Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + } else { + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); + addUInt(*Loc, dwarf::DW_FORM_udata, + DD->getAddressPool().getIndex(Sym, /* TLS */ true)); } - } else { - DD->addArangeLabel(SymbolCU(this, Sym)); - addOpAddress(*Loc, Sym); + // 3) followed by an OP to make the debugger do a TLS lookup. + addUInt(*Loc, dwarf::DW_FORM_data1, + DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address + : dwarf::DW_OP_form_tls_address); } + } else { + DD->addArangeLabel(SymbolCU(this, Sym)); + addOpAddress(*Loc, Sym); } - if (Expr) { - DwarfExpr->addFragmentOffset(Expr); - DwarfExpr->AddExpression(Expr); - } + } + if (Expr) { + DwarfExpr->addFragmentOffset(Expr); + DwarfExpr->addExpression(Expr); } } if (Loc) @@ -227,17 +245,6 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { CURanges.back().setEnd(Range.getEnd()); } -DIE::value_iterator -DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label, const MCSymbol *Sec) { - if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - return addLabel(Die, Attribute, - DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - Label); - return addSectionDelta(Die, Attribute, Label, Sec); -} - void DwarfCompileUnit::initStmtList() { // Define start line table label for each Compile Unit. MCSymbol *LineTableStartSym = @@ -362,15 +369,6 @@ void DwarfCompileUnit::constructScopeDIE( FinalChildren.push_back(std::move(ScopeDIE)); } -DIE::value_iterator -DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo) { - return Die.addValue(DIEValueAllocator, Attribute, - DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - new (DIEValueAllocator) DIEDelta(Hi, Lo)); -} - void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range) { const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); @@ -422,7 +420,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) { auto *InlinedSP = getDISubprogram(DS); // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP]; + DIE *OriginDIE = getAbstractSPDies()[InlinedSP]; assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine); @@ -507,8 +505,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); // If there is an expression, emit raw unsigned bytes. 
DwarfExpr.addFragmentOffset(Expr); - DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm()); - DwarfExpr.AddExpression(Expr); + DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm()); + DwarfExpr.addExpression(Expr); addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); } else addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType()); @@ -529,12 +527,19 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); for (auto &Fragment : DV.getFrameIndexExprs()) { unsigned FrameReg = 0; + const DIExpression *Expr = Fragment.Expr; const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg); - DwarfExpr.addFragmentOffset(Fragment.Expr); - DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - FrameReg, Offset); - DwarfExpr.AddExpression(Fragment.Expr); + DwarfExpr.addFragmentOffset(Expr); + SmallVector<uint64_t, 8> Ops; + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(Offset); + Ops.append(Expr->elements_begin(), Expr->elements_end()); + DIExpressionCursor Cursor(Ops); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addMachineRegExpression( + *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg); + DwarfExpr.addExpression(std::move(Cursor)); } addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); @@ -609,7 +614,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( LexicalScope *Scope) { - DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()]; + DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()]; if (AbsDef) return; @@ -659,8 +664,9 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( else EntityDie = getDIE(Entity); assert(EntityDie); - addSourceLine(*IMDie, Module->getLine(), Module->getScope()->getFilename(), - Module->getScope()->getDirectory()); + auto *File = Module->getFile(); + addSourceLine(*IMDie, Module->getLine(), File ? File->getFilename() : "", + File ? File->getDirectory() : ""); addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie); StringRef Name = Module->getName(); if (!Name.empty()) @@ -671,7 +677,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) { DIE *D = getDIE(SP); - if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) { + if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) { if (D) // If this subprogram has an abstract definition, reference that addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); @@ -683,6 +689,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) { } } +void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) { + DbgVariable *AbsVar = getExistingAbstractVariable( + InlinedVariable(Var.getVariable(), Var.getInlinedAt())); + auto *VariableDie = Var.getDIE(); + if (AbsVar && AbsVar->getDIE()) { + addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, + *AbsVar->getDIE()); + } else + applyVariableAttributes(Var, *VariableDie); +} + +DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) { + const DILocalVariable *Cleansed; + return getExistingAbstractVariable(IV, Cleansed); +} + +// Find abstract variable, if any, associated with Var. 
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable( + InlinedVariable IV, const DILocalVariable *&Cleansed) { + // More then one inlined variable corresponds to one abstract variable. + Cleansed = IV.first; + auto &AbstractVariables = getAbstractVariables(); + auto I = AbstractVariables.find(Cleansed); + if (I != AbstractVariables.end()) + return I->second.get(); + return nullptr; +} + +void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var, + LexicalScope *Scope) { + assert(Scope && Scope->isAbstractScope()); + auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr); + DU->addScopeVariable(Scope, AbsDbgVariable.get()); + getAbstractVariables()[Var] = std::move(AbsDbgVariable); +} + void DwarfCompileUnit::emitHeader(bool UseOffsets) { // Don't bother labeling the .dwo unit, as its offset isn't used. if (!Skeleton) { @@ -690,27 +732,54 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) { Asm->OutStreamer->EmitLabel(LabelBegin); } - DwarfUnit::emitHeader(UseOffsets); + dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile + : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton + : dwarf::DW_UT_compile; + DwarfUnit::emitCommonHeader(UseOffsets, UT); } /// addGlobalName - Add a new global name to the compile unit. -void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die, +void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Name.str(); GlobalNames[FullName] = &Die; } +void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, + const DIScope *Context) { + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) + return; + std::string FullName = getParentContextString(Context) + Name.str(); + // Insert, allowing the entry to remain as-is if it's already present + // This way the CU-level type DIE is preferred over the "can't describe this + // type as a unit offset because it's not really in the CU at all, it's only + // in a type unit" + GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} + /// Add a new global type to the unit. void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Ty->getName().str(); GlobalTypes[FullName] = &Die; } +void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, + const DIScope *Context) { + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) + return; + std::string FullName = getParentContextString(Context) + Ty->getName().str(); + // Insert, allowing the entry to remain as-is if it's already present + // This way the CU-level type DIE is preferred over the "can't describe this + // type as a unit offset because it's not really in the CU at all, it's only + // in a type unit" + GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} + /// addVariableAddress - Add DW_AT_location attribute for a /// DbgVariable based on provided MachineLocation. 
void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, @@ -727,22 +796,23 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute, const MachineLocation &Location) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression Expr(*Asm, *this, *Loc); - - bool validReg; - if (Location.isReg()) - validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg()); - else - validReg = - Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg(), Location.getOffset()); + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); - if (!validReg) + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(Location.getOffset()); + } + DIExpressionCursor Cursor(Ops); + const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) return; + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Expr.finalize()); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Start with the address based on the location provided, and generate the @@ -754,23 +824,25 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die, const MachineLocation &Location) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); - const DIExpression *Expr = DV.getSingleExpression(); - DIExpressionCursor ExprCursor(Expr); + const DIExpression *DIExpr = DV.getSingleExpression(); + DwarfExpr.addFragmentOffset(DIExpr); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); + + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(Location.getOffset()); + } + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + DIExpressionCursor Cursor(Ops); const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); - auto Reg = Location.getReg(); - DwarfExpr.addFragmentOffset(Expr); - bool ValidReg = - Location.getOffset() - ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset()) - : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg); - - if (!ValidReg) + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) return; - - DwarfExpr.AddExpression(std::move(ExprCursor)); + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Loc); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Add a Dwarf loclistptr attribute data and value. 
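In the addAddress/addComplexAddress rewrite above, the offset of an indirect MachineLocation is no longer handled by a dedicated AddMachineRegIndirect call; instead DW_OP_plus_uconst and the offset are prepended to the variable's own DIExpression elements, and the combined op stream is walked through a DIExpressionCursor after addMachineRegExpression has emitted the register. Below is a minimal standalone sketch of that op-list shaping; the helper and parameter names are invented, and only the DW_OP_plus_uconst opcode value (0x23) is taken from the DWARF specification.

#include <cstdint>
#include <vector>

// DW_OP_plus_uconst: add a ULEB128 constant to the value on top of the DWARF
// expression stack (opcode 0x23 in the DWARF specification).
constexpr uint64_t DW_OP_plus_uconst = 0x23;

// Invented helper mirroring the shape of the new code: start from the
// register-relative offset, then append the variable's own expression
// elements, and hand the combined stream to the expression emitter.
std::vector<uint64_t> buildLocationOps(uint64_t OffsetFromReg,
                                       const std::vector<uint64_t> &VarExpr) {
  std::vector<uint64_t> Ops;
  if (OffsetFromReg != 0) {
    Ops.push_back(DW_OP_plus_uconst);
    Ops.push_back(OffsetFromReg);
  }
  Ops.insert(Ops.end(), VarExpr.begin(), VarExpr.end());
  return Ops; // in the real code this feeds a DIExpressionCursor
}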
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index a8025f1..e386727 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -15,8 +15,8 @@ #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H #include "DwarfUnit.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/DebugInfo.h" -#include "llvm/Support/Dwarf.h" namespace llvm { @@ -28,7 +28,7 @@ class DwarfFile; class MCSymbol; class LexicalScope; -class DwarfCompileUnit : public DwarfUnit { +class DwarfCompileUnit final : public DwarfUnit { /// A numeric ID unique among all CUs in the module unsigned UniqueID; @@ -68,13 +68,26 @@ class DwarfCompileUnit : public DwarfUnit { // ranges/locs. const MCSymbol *BaseAddress; + DenseMap<const MDNode *, DIE *> AbstractSPDies; + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables; + /// \brief Construct a DIE for the given DbgVariable without initializing the /// DbgVariable's DIE reference. DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract); bool isDwoUnit() const override; - bool includeMinimalInlineScopes() const; + DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractSPDies; + return DU->getAbstractSPDies(); + } + + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractVariables; + return DU->getAbstractVariables(); + } public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, @@ -86,6 +99,8 @@ public: return Skeleton; } + bool includeMinimalInlineScopes() const; + void initStmtList(); /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE. @@ -112,10 +127,6 @@ public: void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label); - /// addSectionDelta - Add a label delta attribute data and value. - DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo); - DwarfCompileUnit &getCU() override { return *this; } unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; @@ -136,12 +147,6 @@ public: void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End); - /// addSectionLabel - Add a Dwarf section label attribute data and value. - /// - DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label, - const MCSymbol *Sec); - /// \brief Find DIE for the given subprogram and attach appropriate /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global /// variables in this scope then create and insert DIEs for these @@ -189,6 +194,13 @@ public: DIE *constructImportedEntityDIE(const DIImportedEntity *Module); void finishSubprogramDefinition(const DISubprogram *SP); + void finishVariableDefinition(const DbgVariable &Var); + /// Find abstract variable associated with Var. + typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; + DbgVariable *getExistingAbstractVariable(InlinedVariable IV, + const DILocalVariable *&Cleansed); + DbgVariable *getExistingAbstractVariable(InlinedVariable IV); + void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); /// Set the skeleton unit associated with this unit. 
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; } @@ -210,12 +222,19 @@ public: } /// Add a new global name to the compile unit. - void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override; + void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) override; + + /// Add a new global name present in a type unit to this compile unit. + void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context); /// Add a new global type to the compile unit. void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) override; + /// Add a new global type present in a type unit to this compile unit. + void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context); + const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; } const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 91a3d09..f1b4d9f 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/DIE.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -38,8 +39,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" @@ -72,6 +71,10 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section", cl::desc("Generate dwarf aranges"), cl::init(false)); +static cl::opt<bool> SplitDwarfCrossCuReferences( + "split-dwarf-cross-cu-references", cl::Hidden, + cl::desc("Enable cross-cu references in DWO files"), cl::init(false)); + namespace { enum DefaultOnOff { Default, Enable, Disable }; } @@ -92,14 +95,6 @@ DwarfAccelTables("dwarf-accel-tables", cl::Hidden, cl::init(Default)); static cl::opt<DefaultOnOff> -SplitDwarf("split-dwarf", cl::Hidden, - cl::desc("Output DWARF5 split debug info."), - cl::values(clEnumVal(Default, "Default for platform"), - clEnumVal(Enable, "Enabled"), - clEnumVal(Disable, "Disabled")), - cl::init(Default)); - -static cl::opt<DefaultOnOff> DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden, cl::desc("Generate DWARF pubnames and pubtypes sections"), cl::values(clEnumVal(Default, "Default for platform"), @@ -127,17 +122,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission"; static const char *const DbgTimerName = "writer"; static const char *const DbgTimerDescription = "DWARF Debug Writer"; -void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) { +void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) { BS.EmitInt8( Op, Comment ? 
Twine(Comment) + " " + dwarf::OperationEncodingString(Op) : dwarf::OperationEncodingString(Op)); } -void DebugLocDwarfExpression::EmitSigned(int64_t Value) { +void DebugLocDwarfExpression::emitSigned(int64_t Value) { BS.EmitSLEB128(Value, Twine(Value)); } -void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) { +void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) { BS.EmitULEB128(Value, Twine(Value)); } @@ -200,6 +195,12 @@ const DIType *DbgVariable::getType() const { } ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const { + if (FrameIndexExprs.size() == 1) + return FrameIndexExprs; + + assert(all_of(FrameIndexExprs, + [](const FrameIndexExpr &A) { return A.Expr->isFragment(); }) && + "multiple FI expressions without DW_OP_LLVM_fragment"); std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(), [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool { return A.Expr->getFragmentInfo()->OffsetInBits < @@ -248,17 +249,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) HasAppleExtensionAttributes = tuneForLLDB(); - // Handle split DWARF. Off by default for now. - if (SplitDwarf == Default) - HasSplitDwarf = false; - else - HasSplitDwarf = SplitDwarf == Enable; - - // Pubnames/pubtypes on by default for GDB. - if (DwarfPubSections == Default) - HasDwarfPubSections = tuneForGDB(); - else - HasDwarfPubSections = DwarfPubSections == Enable; + // Handle split DWARF. + HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty(); // SCE defaults to linkage names only for abstract subprograms. if (DwarfLinkageNames == DefaultLinkageNames) @@ -368,25 +360,49 @@ template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) { F(*SkelCU); } -void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { +bool DwarfDebug::shareAcrossDWOCUs() const { + return SplitDwarfCrossCuReferences; +} + +void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, + LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); assert(Scope->isAbstractScope()); assert(!Scope->getInlinedAt()); auto *SP = cast<DISubprogram>(Scope->getScopeNode()); - ProcessedSPNodes.insert(SP); - // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - auto &CU = *CUMap.lookup(SP->getUnit()); - forBothCUs(CU, [&](DwarfCompileUnit &CU) { - CU.constructAbstractSubprogramScopeDIE(Scope); - }); + if (useSplitDwarf() && !shareAcrossDWOCUs() && !SP->getUnit()->getSplitDebugInlining()) + // Avoid building the original CU if it won't be used + SrcCU.constructAbstractSubprogramScopeDIE(Scope); + else { + auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + if (auto *SkelCU = CU.getSkeleton()) { + (shareAcrossDWOCUs() ? CU : SrcCU) + .constructAbstractSubprogramScopeDIE(Scope); + if (CU.getCUNode()->getSplitDebugInlining()) + SkelCU->constructAbstractSubprogramScopeDIE(Scope); + } else + CU.constructAbstractSubprogramScopeDIE(Scope); + } +} + +bool DwarfDebug::hasDwarfPubSections(bool includeMinimalInlineScopes) const { + // Opting in to GNU Pubnames/types overrides the default to ensure these are + // generated for things like Gold's gdb_index generation. 
+ if (GenerateGnuPubSections) + return true; + + if (DwarfPubSections == Default) + return tuneForGDB() && !includeMinimalInlineScopes; + + return DwarfPubSections == Enable; } -void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { - if (!GenerateGnuPubSections) +void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const { + if (!hasDwarfPubSections(U.includeMinimalInlineScopes())) return; U.addFlag(D, dwarf::DW_AT_GNU_pubnames); @@ -395,7 +411,9 @@ void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { // Create new DwarfCompileUnit for the given metadata node with tag // DW_TAG_compile_unit. DwarfCompileUnit & -DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { +DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { + if (auto *CU = CUMap.lookup(DIUnit)) + return *CU; StringRef FN = DIUnit->getFilename(); CompilationDir = DIUnit->getDirectory(); @@ -407,7 +425,7 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { if (useSplitDwarf()) { NewCU.setSkeleton(constructSkeletonCU(NewCU)); NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, - DIUnit->getSplitDebugFilename()); + Asm->TM.Options.MCOptions.SplitDwarfFile); } // LTO with assembly output shares a single line table amongst multiple CUs. @@ -418,7 +436,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { Asm->OutStreamer->getContext().setMCLineTableCompilationDir( NewCU.getUniqueID(), CompilationDir); - NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer()); + StringRef Producer = DIUnit->getProducer(); + StringRef Flags = DIUnit->getFlags(); + if (!Flags.empty()) { + std::string ProducerWithFlags = Producer.str() + " " + Flags.str(); + NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags); + } else + NewCU.addString(Die, dwarf::DW_AT_producer, Producer); + NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, DIUnit->getSourceLanguage()); NewCU.addString(Die, dwarf::DW_AT_name, FN); @@ -521,7 +546,12 @@ void DwarfDebug::beginModule() { } for (DICompileUnit *CUNode : M->debug_compile_units()) { - DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); + if (CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() && + CUNode->getGlobalVariables().empty() && + CUNode->getImportedEntities().empty() && CUNode->getMacros().empty()) + continue; + + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(CUNode); for (auto *IE : CUNode->getImportedEntities()) CU.addImportedEntity(IE); @@ -544,7 +574,6 @@ void DwarfDebug::beginModule() { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. if (DIType *RT = dyn_cast<DIType>(Ty)) - if (!RT->isExternalTypeRef()) // There is no point in force-emitting a forward declaration. CU.getOrCreateTypeDIE(RT); } @@ -564,22 +593,17 @@ void DwarfDebug::finishVariableDefinitions() { // DIE::getUnit isn't simple - it walks parent pointers, etc. 
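Aside (illustrative sketch, not part of the patch): the pubnames/pubtypes decision that the new DwarfDebug::hasDwarfPubSections() encodes can be condensed as below, with plain enums and bools standing in for the LLVM option types; all names here are made up for the example.

#include <cassert>

enum class OnOff { Default, Enable, Disable };

// Mirrors the decision added above: an explicit GNU pubnames opt-in wins,
// the platform default applies only for GDB tuning with full debug info,
// otherwise the explicit -generate-dwarf-pub-sections setting decides.
static bool wantsPubSections(bool GenerateGnuPubSections, OnOff DwarfPubSections,
                             bool TuningIsGDB, bool MinimalInlineScopes) {
  if (GenerateGnuPubSections)
    return true;
  if (DwarfPubSections == OnOff::Default)
    return TuningIsGDB && !MinimalInlineScopes;
  return DwarfPubSections == OnOff::Enable;
}

int main() {
  assert(!wantsPubSections(false, OnOff::Default, /*GDB*/ true, /*gmlt*/ true));
  assert(wantsPubSections(true, OnOff::Disable, false, true));
  return 0;
}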
DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie()); assert(Unit); - DbgVariable *AbsVar = getExistingAbstractVariable( - InlinedVariable(Var->getVariable(), Var->getInlinedAt())); - if (AbsVar && AbsVar->getDIE()) { - Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, - *AbsVar->getDIE()); - } else - Unit->applyVariableAttributes(*Var, *VariableDie); + Unit->finishVariableDefinition(*Var); } } void DwarfDebug::finishSubprogramDefinitions() { - for (const DISubprogram *SP : ProcessedSPNodes) - if (SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug) - forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) { - CU.finishSubprogramDefinition(SP); - }); + for (const DISubprogram *SP : ProcessedSPNodes) { + assert(SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug); + forBothCUs( + getOrCreateDwarfCompileUnit(SP->getUnit()), + [&](DwarfCompileUnit &CU) { CU.finishSubprogramDefinition(SP); }); + } } void DwarfDebug::finalizeModuleInfo() { @@ -589,6 +613,13 @@ void DwarfDebug::finalizeModuleInfo() { finishVariableDefinitions(); + // Include the DWO file name in the hash if there's more than one CU. + // This handles ThinLTO's situation where imported CUs may very easily be + // duplicate with the same CU partially imported into another ThinLTO unit. + StringRef DWOName; + if (CUMap.size() > 1) + DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile; + // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -603,7 +634,8 @@ void DwarfDebug::finalizeModuleInfo() { auto *SkCU = TheCU.getSkeleton(); if (useSplitDwarf()) { // Emit a unique identifier for this CU. - uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie()); + uint64_t ID = + DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie()); TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, ID); SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, @@ -712,63 +744,40 @@ void DwarfDebug::endModule() { } // Emit the pubnames and pubtypes sections if requested. - if (HasDwarfPubSections) { + // The condition is optimistically correct - any CU not using GMLT (& + // implicit/default pubnames state) might still have pubnames. + if (hasDwarfPubSections(/* gmlt */ false)) { emitDebugPubNames(GenerateGnuPubSections); emitDebugPubTypes(GenerateGnuPubSections); } // clean up. - AbstractVariables.clear(); -} - -// Find abstract variable, if any, associated with Var. -DbgVariable * -DwarfDebug::getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed) { - // More then one inlined variable corresponds to one abstract variable. 
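Aside (illustrative sketch, not part of the patch): the dwo_id change above salts the CU hash with the split-DWARF output file name whenever a module ends up with more than one CU, so ThinLTO-imported duplicates of the same CU do not collide. A toy analogue of that idea follows; it is not the real DIEHash (the patch simply passes DWOName to DIEHash::computeCUSignature), and the hash-combine formula and names are arbitrary.

#include <cstdint>
#include <functional>
#include <string>

// CuContentHash stands in for hashing the unit's DIE tree.
static uint64_t computeDwoId(const std::string &DwoName, uint64_t CuContentHash) {
  if (DwoName.empty()) // single-CU module: nothing extra is mixed in
    return CuContentHash;
  uint64_t Salt = std::hash<std::string>{}(DwoName);
  return CuContentHash ^ (Salt + 0x9e3779b97f4a7c15ULL +
                          (CuContentHash << 6) + (CuContentHash >> 2));
}

int main() {
  // The same CU content hashed under two different DWO names yields different IDs.
  return computeDwoId("a.dwo", 42) != computeDwoId("b.dwo", 42) ? 0 : 1;
}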
- Cleansed = IV.first; - auto I = AbstractVariables.find(Cleansed); - if (I != AbstractVariables.end()) - return I->second.get(); - return nullptr; + // FIXME: AbstractVariables.clear(); } -DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) { - const DILocalVariable *Cleansed; - return getExistingAbstractVariable(IV, Cleansed); -} - -void DwarfDebug::createAbstractVariable(const DILocalVariable *Var, - LexicalScope *Scope) { - auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr); - InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get()); - AbstractVariables[Var] = std::move(AbsDbgVariable); -} - -void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV, +void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; - createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( + CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( cast<DILocalScope>(ScopeNode))); } -void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped( +void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; if (LexicalScope *Scope = LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode))) - createAbstractVariable(Cleansed, Scope); + CU.createAbstractVariable(Cleansed, Scope); } - // Collect variable information from side table maintained by MF. void DwarfDebug::collectVariableInfoFromMFTable( - DenseSet<InlinedVariable> &Processed) { + DwarfCompileUnit &TheCU, DenseSet<InlinedVariable> &Processed) { for (const auto &VI : Asm->MF->getVariableDbgInfo()) { if (!VI.Var) continue; @@ -783,7 +792,7 @@ void DwarfDebug::collectVariableInfoFromMFTable( if (!Scope) continue; - ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode()); auto RegVar = make_unique<DbgVariable>(Var.first, Var.second); RegVar->initializeMMI(VI.Expr, VI.Slot); if (InfoHolder.addScopeVariable(Scope, RegVar.get())) @@ -954,24 +963,71 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc, } } -DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope, +DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV) { - ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode()); ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second)); InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get()); return ConcreteVariables.back().get(); } -// Determine whether this DBG_VALUE is valid at the beginning of the function. -static bool validAtEntry(const MachineInstr *MInsn) { - auto MBB = MInsn->getParent(); - // Is it in the entry basic block? - if (!MBB->pred_empty()) +/// Determine whether a *singular* DBG_VALUE is valid for the entirety of its +/// enclosing lexical scope. The check ensures there are no other instructions +/// in the same lexical scope preceding the DBG_VALUE and that its range is +/// either open or otherwise rolls off the end of the scope. 
+static bool validThroughout(LexicalScopes &LScopes, + const MachineInstr *DbgValue, + const MachineInstr *RangeEnd) { + assert(DbgValue->getDebugLoc() && "DBG_VALUE without a debug location"); + auto MBB = DbgValue->getParent(); + auto DL = DbgValue->getDebugLoc(); + auto *LScope = LScopes.findLexicalScope(DL); + // Scope doesn't exist; this is a dead DBG_VALUE. + if (!LScope) return false; - for (MachineBasicBlock::const_reverse_iterator I(MInsn); I != MBB->rend(); ++I) - if (!(I->isDebugValue() || I->getFlag(MachineInstr::FrameSetup))) + auto &LSRange = LScope->getRanges(); + if (LSRange.size() == 0) + return false; + + // Determine if the DBG_VALUE is valid at the beginning of its lexical block. + const MachineInstr *LScopeBegin = LSRange.front().first; + // Early exit if the lexical scope begins outside of the current block. + if (LScopeBegin->getParent() != MBB) + return false; + MachineBasicBlock::const_reverse_iterator Pred(DbgValue); + for (++Pred; Pred != MBB->rend(); ++Pred) { + if (Pred->getFlag(MachineInstr::FrameSetup)) + break; + auto PredDL = Pred->getDebugLoc(); + if (!PredDL || Pred->isMetaInstruction()) + continue; + // Check whether the instruction preceding the DBG_VALUE is in the same + // (sub)scope as the DBG_VALUE. + if (DL->getScope() == PredDL->getScope()) return false; - return true; + auto *PredScope = LScopes.findLexicalScope(PredDL); + if (!PredScope || LScope->dominates(PredScope)) + return false; + } + + // If the range of the DBG_VALUE is open-ended, report success. + if (!RangeEnd) + return true; + + // Fail if there are instructions belonging to our scope in another block. + const MachineInstr *LScopeEnd = LSRange.back().second; + if (LScopeEnd->getParent() != MBB) + return false; + + // Single, constant DBG_VALUEs in the prologue are promoted to be live + // throughout the function. This is a hack, presumably for DWARF v2 and not + // necessarily correct. It would be much better to use a dbg.declare instead + // if we know the constant is live throughout the scope. + if (DbgValue->getOperand(0).isImm() && MBB->pred_empty()) + return true; + + return false; } // Find variables for each lexical scope. @@ -979,7 +1035,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP, DenseSet<InlinedVariable> &Processed) { // Grab the variable info that was squirreled away in the MMI side-table. - collectVariableInfoFromMFTable(Processed); + collectVariableInfoFromMFTable(TheCU, Processed); for (const auto &I : DbgValues) { InlinedVariable IV = I.first; @@ -1001,16 +1057,14 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, continue; Processed.insert(IV); - DbgVariable *RegVar = createConcreteVariable(*Scope, IV); + DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV); const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); - // Check if there is a single DBG_VALUE, valid throughout the function. - // A single constant is also considered valid for the entire function. + // Check if there is a single DBG_VALUE, valid throughout the var's scope. 
if (Ranges.size() == 1 && - (MInsn->getOperand(0).isImm() || - (validAtEntry(MInsn) && Ranges.front().second == nullptr))) { + validThroughout(LScopes, MInsn, Ranges.front().second)) { RegVar->initializeDbgValue(MInsn); continue; } @@ -1037,7 +1091,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, for (const DILocalVariable *DV : SP->getVariables()) { if (Processed.insert(InlinedVariable(DV, nullptr)).second) if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope())) - createConcreteVariable(*Scope, InlinedVariable(DV, nullptr)); + createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr)); } } @@ -1046,8 +1100,12 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); assert(CurMI); + const auto *SP = MI->getParent()->getParent()->getFunction()->getSubprogram(); + if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) + return; + // Check if source location changes, but ignore DBG_VALUE and CFI locations. - if (MI->isDebugValue() || MI->isCFIInstruction()) + if (MI->isMetaInstruction()) return; const DebugLoc &DL = MI->getDebugLoc(); // When we emit a line-0 record, we don't update PrevInstLoc; so look at @@ -1129,7 +1187,7 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // the beginning of the function body. for (const auto &MBB : *MF) for (const auto &MI : MBB) - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) return MI.getDebugLoc(); return DebugLoc(); @@ -1137,75 +1195,50 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // Gather pre-function debug information. Assumes being called immediately // after the function entry point has been emitted. -void DwarfDebug::beginFunction(const MachineFunction *MF) { +void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) { CurFn = MF; - // If there's no debug info for the function we're not going to do anything. - if (!MMI->hasDebugInfo()) + auto *SP = MF->getFunction()->getSubprogram(); + assert(LScopes.empty() || SP == LScopes.getCurrentFunctionScope()->getScopeNode()); + if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) return; - auto DI = MF->getFunction()->getSubprogram(); - if (!DI) - return; - - // Grab the lexical scopes for the function, if we don't have any of those - // then we're not going to be able to do anything. - DebugHandlerBase::beginFunction(MF); - if (LScopes.empty()) - return; + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); // Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function // belongs to so that we add to the correct per-cu line table in the // non-asm case. 
- LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - // FnScope->getScopeNode() and DI->second should represent the same function, - // though they may not be the same MDNode due to inline functions merged in - // LTO where the debug info metadata still differs (either due to distinct - // written differences - two versions of a linkonce_odr function - // written/copied into two separate files, or some sub-optimal metadata that - // isn't structurally identical (see: file path/name info from clang, which - // includes the directory of the cpp file being built, even when the file name - // is absolute (such as an <> lookup header))) - auto *SP = cast<DISubprogram>(FnScope->getScopeNode()); - DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit()); - if (!TheCU) { - assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug && - "DICompileUnit missing from llvm.dbg.cu?"); - return; - } if (Asm->OutStreamer->hasRawTextSupport()) // Use a single line table if we are generating assembly. Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); else - Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + Asm->OutStreamer->getContext().setDwarfCompileUnitID(CU.getUniqueID()); // Record beginning of function. PrologEndLoc = findPrologueEndLoc(MF); - if (DILocation *L = PrologEndLoc) { + if (PrologEndLoc) { // We'd like to list the prologue as "not statements" but GDB behaves // poorly if we do that. Revisit this with caution/GDB (7.5+) testing. - auto *SP = L->getInlinedAtScope()->getSubprogram(); + auto *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram(); recordSourceLine(SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT); } } +void DwarfDebug::skippedNonDebugFunction() { + // If we don't have a subprogram for this function then there will be a hole + // in the range information. Keep note of this by setting the previously used + // section to nullptr. + PrevCU = nullptr; + CurFn = nullptr; +} + // Gather and emit post-function debug information. -void DwarfDebug::endFunction(const MachineFunction *MF) { +void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { + const DISubprogram *SP = MF->getFunction()->getSubprogram(); + assert(CurFn == MF && "endFunction should be called with the same function as beginFunction"); - const DISubprogram *SP = MF->getFunction()->getSubprogram(); - if (!MMI->hasDebugInfo() || !SP || - SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) { - // If we don't have a subprogram for this function then there will be a hole - // in the range information. Keep note of this by setting the previously - // used section to nullptr. - PrevCU = nullptr; - CurFn = nullptr; - DebugHandlerBase::endFunction(MF); - return; - } - // Set DwarfDwarfCompileUnitID in MCContext to default value. Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); @@ -1220,17 +1253,14 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd())); // Under -gmlt, skip building the subprogram if there are no inlined - // subroutines inside it. - if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly && + // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram + // is still needed as we need its source location. 
+ if (!TheCU.getCUNode()->getDebugInfoForProfiling() && + TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly && LScopes.getAbstractScopesList().empty() && !IsDarwin) { assert(InfoHolder.getScopeVariables().empty()); - assert(DbgValues.empty()); - // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed - // by a -gmlt CU. Add a test and remove this assertion. - assert(AbstractVariables.empty()); PrevLabel = nullptr; CurFn = nullptr; - DebugHandlerBase::endFunction(MF); return; } @@ -1244,12 +1274,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { for (const DILocalVariable *DV : SP->getVariables()) { if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second) continue; - ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr), + ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr), DV->getScope()); assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes && "ensureAbstractVariableIsCreated inserted abstract scopes"); } - constructAbstractSubprogramScopeDIE(AScope); + constructAbstractSubprogramScopeDIE(TheCU, AScope); } ProcessedSPNodes.insert(SP); @@ -1266,7 +1296,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { InfoHolder.getScopeVariables().clear(); PrevLabel = nullptr; CurFn = nullptr; - DebugHandlerBase::endFunction(MF); } // Register a source line with debug info. Returns the unique label that was @@ -1361,6 +1390,18 @@ void DwarfDebug::emitAccelTypes() { /// computeIndexValue - Compute the gdb index value for the DIE and CU. static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU, const DIE *Die) { + // Entities that ended up only in a Type Unit reference the CU instead (since + // the pub entry has offsets within the CU there's no real offset that can be + // provided anyway). As it happens all such entities (namespaces and types, + // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out + // not to be true it would be necessary to persist this information from the + // point at which the entry is added to the index data structure - since by + // the time the index is built from that, the original type/namespace DIE in a + // type unit has already been destroyed so it can't be queried for properties + // like tag, etc. + if (Die->getTag() == dwarf::DW_TAG_compile_unit) + return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, + dwarf::GIEL_EXTERNAL); dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC; // We could have a specification DIE that has our most of our knowledge, @@ -1418,7 +1459,7 @@ void DwarfDebug::emitDebugPubSection( const auto &Globals = (TheU->*Accessor)(); - if (Globals.empty()) + if (!hasDwarfPubSections(TheU->includeMinimalInlineScopes())) continue; if (auto *Skeleton = TheU->getSkeleton()) @@ -1498,27 +1539,36 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, ByteStreamer &Streamer, const DebugLocEntry::Value &Value, DwarfExpression &DwarfExpr) { - DIExpressionCursor ExprCursor(Value.getExpression()); - DwarfExpr.addFragmentOffset(Value.getExpression()); + auto *DIExpr = Value.getExpression(); + DIExpressionCursor ExprCursor(DIExpr); + DwarfExpr.addFragmentOffset(DIExpr); // Regular entry. 
if (Value.isInt()) { if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed || BT->getEncoding() == dwarf::DW_ATE_signed_char)) - DwarfExpr.AddSignedConstant(Value.getInt()); + DwarfExpr.addSignedConstant(Value.getInt()); else - DwarfExpr.AddUnsignedConstant(Value.getInt()); + DwarfExpr.addUnsignedConstant(Value.getInt()); } else if (Value.isLocation()) { - MachineLocation Loc = Value.getLoc(); + MachineLocation Location = Value.getLoc(); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(Location.getOffset()); + } + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + DIExpressionCursor Cursor(Ops); const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo(); - if (Loc.getOffset()) - DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset()); - else - DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg()); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + return; + return DwarfExpr.addExpression(std::move(Cursor)); } else if (Value.isConstantFP()) { APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt(); - DwarfExpr.AddUnsignedConstant(RawBytes); + DwarfExpr.addUnsignedConstant(RawBytes); } - DwarfExpr.AddExpression(std::move(ExprCursor)); + DwarfExpr.addExpression(std::move(ExprCursor)); } void DebugLocEntry::finalize(const AsmPrinter &AP, @@ -1558,10 +1608,13 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) { // Emit locations into the debug loc section. void DwarfDebug::emitDebugLoc() { + if (DebugLocs.getLists().empty()) + return; + // Start the dwarf loc section. Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfLocSection()); - unsigned char Size = Asm->getDataLayout().getPointerSize(); + unsigned char Size = Asm->MAI->getCodePointerSize(); for (const auto &List : DebugLocs.getLists()) { Asm->OutStreamer->EmitLabel(List.Label); const DwarfCompileUnit *CU = List.CU; @@ -1691,7 +1744,7 @@ void DwarfDebug::emitDebugARanges() { Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfARangesSection()); - unsigned PtrSize = Asm->getDataLayout().getPointerSize(); + unsigned PtrSize = Asm->MAI->getCodePointerSize(); // Build a list of CUs used. std::vector<DwarfCompileUnit *> CUs; @@ -1769,12 +1822,15 @@ void DwarfDebug::emitDebugARanges() { /// Emit address ranges into a debug ranges section. void DwarfDebug::emitDebugRanges() { + if (CUMap.empty()) + return; + // Start the dwarf ranges section. Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfRangesSection()); // Size for our labels. - unsigned char Size = Asm->getDataLayout().getPointerSize(); + unsigned char Size = Asm->MAI->getCodePointerSize(); // Grab the specific ranges for the compile units in the module. for (const auto &I : CUMap) { @@ -1848,6 +1904,9 @@ void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) { /// Emit macros into a debug macinfo section. void DwarfDebug::emitDebugMacinfo() { + if (CUMap.empty()) + return; + // Start the dwarf macinfo section. 
Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfMacinfoSection()); @@ -1869,7 +1928,7 @@ void DwarfDebug::emitDebugMacinfo() { void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, std::unique_ptr<DwarfCompileUnit> NewU) { NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name, - U.getCUNode()->getSplitDebugFilename()); + Asm->TM.Options.MCOptions.SplitDwarfFile); if (!CompilationDir.empty()) NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); @@ -1940,11 +1999,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) { MD5 Hash; Hash.update(Identifier); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. + // implementation always returns its results in little endian, so we actually + // need the "high" word. MD5::MD5Result Result; Hash.final(Result); - return support::endian::read64le(Result + 8); + return Result.high(); } void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 253e3f0..5dfe06c 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -89,7 +89,7 @@ public: assert(!MInsn && "Already initialized?"); assert((!E || E->isValid()) && "Expected valid expression"); - assert(~FI && "Expected valid index"); + assert(FI != INT_MAX && "Expected valid index"); FrameIndexExprs.push_back({FI, E}); } @@ -134,6 +134,13 @@ public: assert(!FrameIndexExprs.empty() && "Expected an MMI entry"); assert(!V.FrameIndexExprs.empty() && "Expected an MMI entry"); + if (FrameIndexExprs.size()) { + auto *Expr = FrameIndexExprs.back().Expr; + // Get rid of duplicate non-fragment entries. More than one non-fragment + // dbg.declare makes no sense so ignore all but the first. + if (!Expr || !Expr->isFragment()) + return; + } FrameIndexExprs.append(V.FrameIndexExprs.begin(), V.FrameIndexExprs.end()); assert(all_of(FrameIndexExprs, [](FrameIndexExpr &FIE) { @@ -210,7 +217,6 @@ class DwarfDebug : public DebugHandlerBase { DenseMap<const MCSymbol *, uint64_t> SymSize; /// Collection of abstract variables. - DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables; SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables; /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists @@ -247,9 +253,6 @@ class DwarfDebug : public DebugHandlerBase { std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1> TypeUnitsUnderConstruction; - /// Whether to emit the pubnames/pubtypes sections. - bool HasDwarfPubSections; - /// Whether to use the GNU TLS opcode (instead of the standard opcode). bool UseGNUTLSOpcode; @@ -313,20 +316,16 @@ class DwarfDebug : public DebugHandlerBase { typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; - /// Find abstract variable associated with Var. 
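Aside (illustrative sketch, not part of the patch): the makeTypeSignature() tweak above replaces an explicit little-endian read of the digest's last eight bytes with MD5Result::high(). The equivalence it relies on is sketched below with a plain 16-byte array standing in for MD5::MD5Result; it assumes a little-endian host so the memcpy matches a little-endian read.

#include <array>
#include <cstdint>
#include <cstring>

// With the digest stored little-endian, the "high" 64-bit word is the last
// eight bytes -- the same value support::endian::read64le(Result + 8) produced.
static uint64_t signatureFromDigest(const std::array<uint8_t, 16> &Digest) {
  uint64_t High;
  std::memcpy(&High, Digest.data() + 8, sizeof(High));
  return High;
}

int main() {
  std::array<uint8_t, 16> D{};
  D[8] = 0x01; // least significant byte of the high word
  return signatureFromDigest(D) == 0x01 ? 0 : 1;
}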
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed); - DbgVariable *getExistingAbstractVariable(InlinedVariable IV); - void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); - void ensureAbstractVariableIsCreated(InlinedVariable Var, + void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var, const MDNode *Scope); - void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var, + void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var, const MDNode *Scope); - DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV); + DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV); /// Construct a DIE for this abstract scope. - void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); + void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope); void finishVariableDefinitions(); @@ -420,11 +419,11 @@ class DwarfDebug : public DebugHandlerBase { /// Flags to let the linker know we have emitted new style pubnames. Only /// emit it here if we don't have a skeleton CU for split dwarf. - void addGnuPubAttributes(DwarfUnit &U, DIE &D) const; + void addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const; /// Create new DwarfCompileUnit for the given metadata node with tag /// DW_TAG_compile_unit. - DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit); + DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit); /// Construct imported_module or imported_declaration DIE. void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, @@ -446,7 +445,17 @@ class DwarfDebug : public DebugHandlerBase { const DbgValueHistoryMap::InstrRanges &Ranges); /// Collect variable information from the side table maintained by MF. - void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P); + void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU, + DenseSet<InlinedVariable> &P); + +protected: + /// Gather pre-function debug information. + void beginFunctionImpl(const MachineFunction *MF) override; + + /// Gather and emit post-function debug information. + void endFunctionImpl(const MachineFunction *MF) override; + + void skippedNonDebugFunction() override; public: //===--------------------------------------------------------------------===// @@ -463,12 +472,6 @@ public: /// Emit all Dwarf sections that should come after the content. void endModule() override; - /// Gather pre-function debug information. - void beginFunction(const MachineFunction *MF) override; - - /// Gather and emit post-function debug information. - void endFunction(const MachineFunction *MF) override; - /// Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; @@ -515,6 +518,8 @@ public: /// split dwarf proposal support. bool useSplitDwarf() const { return HasSplitDwarf; } + bool shareAcrossDWOCUs() const; + /// Returns the Dwarf Version. uint16_t getDwarfVersion() const; @@ -555,6 +560,8 @@ public: /// A helper function to check whether the DIE for a given Scope is /// going to be null. 
bool isLexicalScopeDIENull(LexicalScope *Scope); + + bool hasDwarfPubSections(bool includeMinimalInlineScopes) const; }; } // End of namespace llvm diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 61b2c7e6..fe38ee8 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -14,87 +14,88 @@ #include "DwarfExpression.h" #include "DwarfDebug.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -void DwarfExpression::AddReg(int DwarfReg, const char *Comment) { - assert(DwarfReg >= 0 && "invalid negative dwarf register number"); - if (DwarfReg < 32) { - EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); +void DwarfExpression::addReg(int DwarfReg, const char *Comment) { + assert(DwarfReg >= 0 && "invalid negative dwarf register number"); + assert((LocationKind == Unknown || LocationKind == Register) && + "location description already locked down"); + LocationKind = Register; + if (DwarfReg < 32) { + emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); } else { - EmitOp(dwarf::DW_OP_regx, Comment); - EmitUnsigned(DwarfReg); + emitOp(dwarf::DW_OP_regx, Comment); + emitUnsigned(DwarfReg); } } -void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) { +void DwarfExpression::addBReg(int DwarfReg, int Offset) { assert(DwarfReg >= 0 && "invalid negative dwarf register number"); + assert(LocationKind != Register && "location description already locked down"); if (DwarfReg < 32) { - EmitOp(dwarf::DW_OP_breg0 + DwarfReg); + emitOp(dwarf::DW_OP_breg0 + DwarfReg); } else { - EmitOp(dwarf::DW_OP_bregx); - EmitUnsigned(DwarfReg); + emitOp(dwarf::DW_OP_bregx); + emitUnsigned(DwarfReg); } - EmitSigned(Offset); - if (Deref) - EmitOp(dwarf::DW_OP_deref); + emitSigned(Offset); +} + +void DwarfExpression::addFBReg(int Offset) { + emitOp(dwarf::DW_OP_fbreg); + emitSigned(Offset); } -void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { +void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { if (!SizeInBits) return; const unsigned SizeOfByte = 8; if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { - EmitOp(dwarf::DW_OP_bit_piece); - EmitUnsigned(SizeInBits); - EmitUnsigned(OffsetInBits); + emitOp(dwarf::DW_OP_bit_piece); + emitUnsigned(SizeInBits); + emitUnsigned(OffsetInBits); } else { - EmitOp(dwarf::DW_OP_piece); + emitOp(dwarf::DW_OP_piece); unsigned ByteSize = SizeInBits / SizeOfByte; - EmitUnsigned(ByteSize); + emitUnsigned(ByteSize); } this->OffsetInBits += SizeInBits; } -void DwarfExpression::AddShr(unsigned ShiftBy) { - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(ShiftBy); - EmitOp(dwarf::DW_OP_shr); +void DwarfExpression::addShr(unsigned ShiftBy) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(ShiftBy); + emitOp(dwarf::DW_OP_shr); } -bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI, - unsigned MachineReg, int Offset) { - if (isFrameRegister(TRI, MachineReg)) { - // If variable offset is based in frame register then use fbreg. 
- EmitOp(dwarf::DW_OP_fbreg); - EmitSigned(Offset); - return true; - } - - int DwarfReg = TRI.getDwarfRegNum(MachineReg, false); - if (DwarfReg < 0) - return false; - - AddRegIndirect(DwarfReg, Offset); - return true; +void DwarfExpression::addAnd(unsigned Mask) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Mask); + emitOp(dwarf::DW_OP_and); } -bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, unsigned MaxSize) { - if (!TRI.isPhysicalRegister(MachineReg)) + if (!TRI.isPhysicalRegister(MachineReg)) { + if (isFrameRegister(TRI, MachineReg)) { + DwarfRegs.push_back({-1, 0, nullptr}); + return true; + } return false; + } int Reg = TRI.getDwarfRegNum(MachineReg, false); // If this is a valid register number, emit it. if (Reg >= 0) { - AddReg(Reg); + DwarfRegs.push_back({Reg, 0, nullptr}); return true; } @@ -106,7 +107,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg); unsigned Size = TRI.getSubRegIdxSize(Idx); unsigned RegOffset = TRI.getSubRegIdxOffset(Idx); - AddReg(Reg, "super-register"); + DwarfRegs.push_back({Reg, 0, "super-register"}); // Use a DW_OP_bit_piece to describe the sub-register. setSubRegisterPiece(Size, RegOffset); return true; @@ -116,8 +117,9 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, // Otherwise, attempt to find a covering set of sub-register numbers. // For example, Q0 on ARM is a composition of D0+D1. unsigned CurPos = 0; - // The size of the register in bits, assuming 8 bits per byte. - unsigned RegSize = TRI.getMinimalPhysRegClass(MachineReg)->getSize() * 8; + // The size of the register in bits. + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg); + unsigned RegSize = TRI.getRegSizeInBits(*RC); // Keep track of the bits in the register we already emitted, so we // can avoid emitting redundant aliasing subregs. SmallBitVector Coverage(RegSize, false); @@ -136,100 +138,166 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, // If this sub-register has a DWARF number and we haven't covered // its range, emit a DWARF piece for it. if (Reg >= 0 && Intersection.any()) { - AddReg(Reg, "sub-register"); + // Emit a piece for any gap in the coverage. + if (Offset > CurPos) + DwarfRegs.push_back({-1, Offset - CurPos, nullptr}); + DwarfRegs.push_back( + {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"}); if (Offset >= MaxSize) break; - // Emit a piece for the any gap in the coverage. - if (Offset > CurPos) - AddOpPiece(Offset - CurPos); - AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset)); - CurPos = Offset + Size; // Mark it as emitted. 
Coverage.set(Offset, Offset + Size); + CurPos = Offset + Size; } } return CurPos; } -void DwarfExpression::AddStackValue() { +void DwarfExpression::addStackValue() { if (DwarfVersion >= 4) - EmitOp(dwarf::DW_OP_stack_value); + emitOp(dwarf::DW_OP_stack_value); } -void DwarfExpression::AddSignedConstant(int64_t Value) { - EmitOp(dwarf::DW_OP_consts); - EmitSigned(Value); - AddStackValue(); +void DwarfExpression::addSignedConstant(int64_t Value) { + assert(LocationKind == Implicit || LocationKind == Unknown); + LocationKind = Implicit; + emitOp(dwarf::DW_OP_consts); + emitSigned(Value); } -void DwarfExpression::AddUnsignedConstant(uint64_t Value) { - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Value); - AddStackValue(); +void DwarfExpression::addUnsignedConstant(uint64_t Value) { + assert(LocationKind == Implicit || LocationKind == Unknown); + LocationKind = Implicit; + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Value); } -void DwarfExpression::AddUnsignedConstant(const APInt &Value) { +void DwarfExpression::addUnsignedConstant(const APInt &Value) { + assert(LocationKind == Implicit || LocationKind == Unknown); + LocationKind = Implicit; + unsigned Size = Value.getBitWidth(); const uint64_t *Data = Value.getRawData(); // Chop it up into 64-bit pieces, because that's the maximum that - // AddUnsignedConstant takes. + // addUnsignedConstant takes. unsigned Offset = 0; while (Offset < Size) { - AddUnsignedConstant(*Data++); + addUnsignedConstant(*Data++); if (Offset == 0 && Size <= 64) break; - AddOpPiece(std::min(Size-Offset, 64u), Offset); + addStackValue(); + addOpPiece(std::min(Size - Offset, 64u), Offset); Offset += 64; } } -bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &ExprCursor, unsigned MachineReg, unsigned FragmentOffsetInBits) { - if (!ExprCursor) - return AddMachineReg(TRI, MachineReg); + auto Fragment = ExprCursor.getFragmentInfo(); + if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) { + LocationKind = Unknown; + return false; + } - // Pattern-match combinations for which more efficient representations exist - // first. - bool ValidReg = false; + bool HasComplexExpression = false; auto Op = ExprCursor.peek(); - switch (Op->getOp()) { - default: { - auto Fragment = ExprCursor.getFragmentInfo(); - ValidReg = AddMachineReg(TRI, MachineReg, - Fragment ? Fragment->SizeInBits : ~1U); - break; + if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment) + HasComplexExpression = true; + + // If the register can only be described by a complex expression (i.e., + // multiple subregisters) it doesn't safely compose with another complex + // expression. For example, it is not possible to apply a DW_OP_deref + // operation to multiple DW_OP_pieces. + if (HasComplexExpression && DwarfRegs.size() > 1) { + DwarfRegs.clear(); + LocationKind = Unknown; + return false; } - case dwarf::DW_OP_plus: - case dwarf::DW_OP_minus: { - // [DW_OP_reg,Offset,DW_OP_plus, DW_OP_deref] --> [DW_OP_breg, Offset]. - // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset]. - auto N = ExprCursor.peekNext(); - if (N && N->getOp() == dwarf::DW_OP_deref) { - unsigned Offset = Op->getArg(0); - ValidReg = AddMachineRegIndirect( - TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset); - ExprCursor.consume(2); - } else - ValidReg = AddMachineReg(TRI, MachineReg); - break; + + // Handle simple register locations. 
+ if (LocationKind != Memory && !HasComplexExpression) { + for (auto &Reg : DwarfRegs) { + if (Reg.DwarfRegNo >= 0) + addReg(Reg.DwarfRegNo, Reg.Comment); + addOpPiece(Reg.Size); + } + DwarfRegs.clear(); + return true; } - case dwarf::DW_OP_deref: - // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg]. - ValidReg = AddMachineRegIndirect(TRI, MachineReg); + + // Don't emit locations that cannot be expressed without DW_OP_stack_value. + if (DwarfVersion < 4) + if (std::any_of(ExprCursor.begin(), ExprCursor.end(), + [](DIExpression::ExprOperand Op) -> bool { + return Op.getOp() == dwarf::DW_OP_stack_value; + })) { + DwarfRegs.clear(); + LocationKind = Unknown; + return false; + } + + assert(DwarfRegs.size() == 1); + auto Reg = DwarfRegs[0]; + bool FBReg = isFrameRegister(TRI, MachineReg); + int SignedOffset = 0; + assert(Reg.Size == 0 && "subregister has same size as superregister"); + + // Pattern-match combinations for which more efficient representations exist. + // [Reg, DW_OP_plus_uconst, Offset] --> [DW_OP_breg, Offset]. + if (Op && (Op->getOp() == dwarf::DW_OP_plus_uconst)) { + SignedOffset = Op->getArg(0); ExprCursor.take(); - break; } - return ValidReg; + // [Reg, DW_OP_constu, Offset, DW_OP_plus] --> [DW_OP_breg, Offset] + // [Reg, DW_OP_constu, Offset, DW_OP_minus] --> [DW_OP_breg,-Offset] + // If Reg is a subregister we need to mask it out before subtracting. + if (Op && Op->getOp() == dwarf::DW_OP_constu) { + auto N = ExprCursor.peekNext(); + if (N && (N->getOp() == dwarf::DW_OP_plus || + (N->getOp() == dwarf::DW_OP_minus && !SubRegisterSizeInBits))) { + int Offset = Op->getArg(0); + SignedOffset = (N->getOp() == dwarf::DW_OP_minus) ? -Offset : Offset; + ExprCursor.consume(2); + } + } + + if (FBReg) + addFBReg(SignedOffset); + else + addBReg(Reg.DwarfRegNo, SignedOffset); + DwarfRegs.clear(); + return true; +} + +/// Assuming a well-formed expression, match "DW_OP_deref* DW_OP_LLVM_fragment?". +static bool isMemoryLocation(DIExpressionCursor ExprCursor) { + while (ExprCursor) { + auto Op = ExprCursor.take(); + switch (Op->getOp()) { + case dwarf::DW_OP_deref: + case dwarf::DW_OP_LLVM_fragment: + break; + default: + return false; + } + } + return true; } -void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, +void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, unsigned FragmentOffsetInBits) { + // If we need to mask out a subregister, do it now, unless the next + // operation would emit an OpPiece anyway. + auto N = ExprCursor.peek(); + if (SubRegisterSizeInBits && N && (N->getOp() != dwarf::DW_OP_LLVM_fragment)) + maskSubRegister(); + while (ExprCursor) { auto Op = ExprCursor.take(); switch (Op->getOp()) { @@ -241,49 +309,91 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, // location. assert(OffsetInBits >= FragmentOffset && "fragment offset not added?"); - // If \a AddMachineReg already emitted DW_OP_piece operations to represent + // If addMachineReg already emitted DW_OP_piece operations to represent // a super-register by splicing together sub-registers, subtract the size // of the pieces that was already emitted. SizeInBits -= OffsetInBits - FragmentOffset; - // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a + // If addMachineReg requested a DW_OP_bit_piece to stencil out a // sub-register that is smaller than the current fragment's size, use it. 
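Aside (illustrative sketch, not part of the patch): the new pattern matching in addMachineRegExpression() folds a leading constant-offset prefix of the DIExpression into the offset operand of a single DW_OP_breg/DW_OP_fbreg. The sketch below isolates just that folding step, using the real DWARF opcode encodings but a plain std::vector in place of DIExpressionCursor; the actual code additionally refuses the DW_OP_minus fold while a sub-register mask is still pending.

#include <cassert>
#include <cstdint>
#include <vector>

enum : uint64_t {
  DW_OP_plus_uconst = 0x23,
  DW_OP_constu = 0x10,
  DW_OP_plus = 0x22,
  DW_OP_minus = 0x1c
};

// Fold "[DW_OP_plus_uconst, N]" or "[DW_OP_constu, N, DW_OP_plus/minus]" into
// a signed offset and drop the consumed operands.
static int64_t foldLeadingOffset(std::vector<uint64_t> &Expr) {
  if (Expr.size() >= 2 && Expr[0] == DW_OP_plus_uconst) {
    int64_t Off = static_cast<int64_t>(Expr[1]);
    Expr.erase(Expr.begin(), Expr.begin() + 2);
    return Off;
  }
  if (Expr.size() >= 3 && Expr[0] == DW_OP_constu &&
      (Expr[2] == DW_OP_plus || Expr[2] == DW_OP_minus)) {
    int64_t Off = static_cast<int64_t>(Expr[1]);
    if (Expr[2] == DW_OP_minus)
      Off = -Off;
    Expr.erase(Expr.begin(), Expr.begin() + 3);
    return Off;
  }
  return 0;
}

int main() {
  std::vector<uint64_t> E = {DW_OP_constu, 16, DW_OP_minus};
  assert(foldLeadingOffset(E) == -16 && E.empty()); // becomes DW_OP_breg<reg>, -16
  return 0;
}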
if (SubRegisterSizeInBits) SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits); - - AddOpPiece(SizeInBits, SubRegisterOffsetInBits); + + // Emit a DW_OP_stack_value for implicit location descriptions. + if (LocationKind == Implicit) + addStackValue(); + + // Emit the DW_OP_piece. + addOpPiece(SizeInBits, SubRegisterOffsetInBits); setSubRegisterPiece(0, 0); - break; + // Reset the location description kind. + LocationKind = Unknown; + return; } - case dwarf::DW_OP_plus: - EmitOp(dwarf::DW_OP_plus_uconst); - EmitUnsigned(Op->getArg(0)); + case dwarf::DW_OP_plus_uconst: + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_plus_uconst); + emitUnsigned(Op->getArg(0)); break; + case dwarf::DW_OP_plus: case dwarf::DW_OP_minus: - // There is no OP_minus_uconst. - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Op->getArg(0)); - EmitOp(dwarf::DW_OP_minus); + emitOp(Op->getOp()); break; - case dwarf::DW_OP_deref: - EmitOp(dwarf::DW_OP_deref); + case dwarf::DW_OP_deref: { + assert(LocationKind != Register); + if (LocationKind != Memory && isMemoryLocation(ExprCursor)) + // Turning this into a memory location description makes the deref + // implicit. + LocationKind = Memory; + else + emitOp(dwarf::DW_OP_deref); break; + } case dwarf::DW_OP_constu: - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Op->getArg(0)); + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Op->getArg(0)); break; case dwarf::DW_OP_stack_value: - AddStackValue(); + LocationKind = Implicit; + break; + case dwarf::DW_OP_swap: + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_swap); + break; + case dwarf::DW_OP_xderef: + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_xderef); break; default: llvm_unreachable("unhandled opcode found in expression"); } } + + if (LocationKind == Implicit) + // Turn this into an implicit location description. + addStackValue(); +} + +/// add masking operations to stencil out a subregister. +void DwarfExpression::maskSubRegister() { + assert(SubRegisterSizeInBits && "no subregister was registered"); + if (SubRegisterOffsetInBits > 0) + addShr(SubRegisterOffsetInBits); + uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL; + addAnd(Mask); } + void DwarfExpression::finalize() { - if (SubRegisterSizeInBits) - AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); + assert(DwarfRegs.size() == 0 && "dwarf registers not emitted"); + // Emit any outstanding DW_OP_piece operations to mask out subregisters. + if (SubRegisterSizeInBits == 0) + return; + // Don't emit a DW_OP_piece for a subregister at offset 0. 
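Aside (illustrative sketch, not part of the patch): maskSubRegister() stencils a sub-register out of its super-register with DW_OP_constu/DW_OP_shr followed by DW_OP_constu/DW_OP_and. The same computation in ordinary integer arithmetic, as a worked example with arbitrary values:

#include <cassert>
#include <cstdint>

// The shift is skipped when the offset is zero, as in the patch; SizeInBits
// is assumed to be < 64 (the mask formula matches addAnd's operand).
static uint64_t extractSubRegister(uint64_t SuperRegValue, unsigned OffsetInBits,
                                   unsigned SizeInBits) {
  uint64_t Shifted = OffsetInBits ? (SuperRegValue >> OffsetInBits) : SuperRegValue;
  uint64_t Mask = (1ULL << static_cast<uint64_t>(SizeInBits)) - 1ULL;
  return Shifted & Mask;
}

int main() {
  // A 16-bit sub-register at bit offset 32 of a 64-bit super-register value.
  assert(extractSubRegister(0x123456789abcdef0ULL, 32, 16) == 0x5678);
  return 0;
}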
+ if (SubRegisterOffsetInBits == 0) + return; + addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); } void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { @@ -294,6 +404,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { assert(FragmentOffset >= OffsetInBits && "overlapping or duplicate fragments"); if (FragmentOffset > OffsetInBits) - AddOpPiece(FragmentOffset - OffsetInBits); + addOpPiece(FragmentOffset - OffsetInBits); OffsetInBits = FragmentOffset; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index fd90fa0..728f8ad 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -42,6 +42,9 @@ public: DIExpressionCursor(ArrayRef<uint64_t> Expr) : Start(Expr.begin()), End(Expr.end()) {} + DIExpressionCursor(const DIExpressionCursor &C) + : Start(C.Start), End(C.End) {} + /// Consume one operation. Optional<DIExpression::ExprOperand> take() { if (Start == End) @@ -72,6 +75,8 @@ public: } /// Determine whether there are any operations left in this expression. operator bool() const { return Start != End; } + DIExpression::expr_op_iterator begin() const { return Start; } + DIExpression::expr_op_iterator end() const { return End; } /// Retrieve the fragment information, if any. Optional<DIExpression::FragmentInfo> getFragmentInfo() const { @@ -84,14 +89,27 @@ public: /// entry. class DwarfExpression { protected: - unsigned DwarfVersion; + /// Holds information about all subregisters comprising a register location. + struct Register { + int DwarfRegNo; + unsigned Size; + const char *Comment; + }; + + /// The register location, if any. + SmallVector<Register, 2> DwarfRegs; + /// Current Fragment Offset in Bits. uint64_t OffsetInBits = 0; + unsigned DwarfVersion; /// Sometimes we need to add a DW_OP_bit_piece to describe a subregister. unsigned SubRegisterSizeInBits = 0; unsigned SubRegisterOffsetInBits = 0; + /// The kind of location description being produced. + enum { Unknown = 0, Register, Memory, Implicit } LocationKind = Unknown; + /// Push a DW_OP_piece / DW_OP_bit_piece for emitting later, if one is needed /// to represent a subregister. void setSubRegisterPiece(unsigned SizeInBits, unsigned OffsetInBits) { @@ -99,35 +117,55 @@ protected: SubRegisterOffsetInBits = OffsetInBits; } -public: - DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} - virtual ~DwarfExpression() {}; - - /// This needs to be called last to commit any pending changes. - void finalize(); + /// Add masking operations to stencil out a subregister. + void maskSubRegister(); /// Output a dwarf operand and an optional assembler comment. - virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0; + virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0; /// Emit a raw signed value. - virtual void EmitSigned(int64_t Value) = 0; + virtual void emitSigned(int64_t Value) = 0; /// Emit a raw unsigned value. - virtual void EmitUnsigned(uint64_t Value) = 0; + virtual void emitUnsigned(uint64_t Value) = 0; /// Return whether the given machine register is the frame register in the /// current function. virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0; - /// Emit a dwarf register operation. - void AddReg(int DwarfReg, const char *Comment = nullptr); - /// Emit an (double-)indirect dwarf register operation. 
- void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false); + /// Emit a DW_OP_reg operation. Note that this is only legal inside a DWARF + /// register location description. + void addReg(int DwarfReg, const char *Comment = nullptr); + /// Emit a DW_OP_breg operation. + void addBReg(int DwarfReg, int Offset); + /// Emit DW_OP_fbreg <Offset>. + void addFBReg(int Offset); + + /// Emit a partial DWARF register operation. + /// + /// \param MachineReg The register number. + /// \param MaxSize If the register must be composed from + /// sub-registers this is an upper bound + /// for how many bits the emitted DW_OP_piece + /// may cover. + /// + /// If size and offset is zero an operation for the entire register is + /// emitted: Some targets do not provide a DWARF register number for every + /// register. If this is the case, this function will attempt to emit a DWARF + /// register by emitting a fragment of a super-register or by piecing together + /// multiple subregisters that alias the register. + /// + /// \return false if no DWARF register exists for MachineReg. + bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, + unsigned MaxSize = ~1U); + /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment. /// \param OffsetInBits This is an optional offset into the location that /// is at the top of the DWARF stack. - void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); + void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); - /// Emit a shift-right dwarf expression. - void AddShr(unsigned ShiftBy); + /// Emit a shift-right dwarf operation. + void addShr(unsigned ShiftBy); + /// Emit a bitwise and dwarf operation. + void addAnd(unsigned Mask); /// Emit a DW_OP_stack_value, if supported. /// @@ -140,48 +178,39 @@ public: /// constant value, so the producers and consumers started to rely on /// heuristics to disambiguate the value vs. location status of the /// expression. See PR21176 for more details. - void AddStackValue(); + void addStackValue(); - /// Emit an indirect dwarf register operation for the given machine register. - /// \return false if no DWARF register exists for MachineReg. - bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg, - int Offset = 0); + ~DwarfExpression() = default; +public: + DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} - /// Emit a partial DWARF register operation. - /// - /// \param MachineReg The register number. - /// \param MaxSize If the register must be composed from - /// sub-registers this is an upper bound - /// for how many bits the emitted DW_OP_piece - /// may cover. - /// - /// If size and offset is zero an operation for the entire register is - /// emitted: Some targets do not provide a DWARF register number for every - /// register. If this is the case, this function will attempt to emit a DWARF - /// register by emitting a fragment of a super-register or by piecing together - /// multiple subregisters that alias the register. - /// - /// \return false if no DWARF register exists for MachineReg. - bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, - unsigned MaxSize = ~1U); + /// This needs to be called last to commit any pending changes. + void finalize(); /// Emit a signed constant. - void AddSignedConstant(int64_t Value); + void addSignedConstant(int64_t Value); /// Emit an unsigned constant. 
- void AddUnsignedConstant(uint64_t Value); + void addUnsignedConstant(uint64_t Value); /// Emit an unsigned constant. - void AddUnsignedConstant(const APInt &Value); + void addUnsignedConstant(const APInt &Value); + + /// Lock this down to become a memory location description. + void setMemoryLocationKind() { + assert(LocationKind == Unknown); + LocationKind = Memory; + } /// Emit a machine register location. As an optimization this may also consume /// the prefix of a DwarfExpression if a more efficient representation for /// combining the register location and the first operation exists. /// - /// \param FragmentOffsetInBits If this is one fragment out of a fragmented + /// \param FragmentOffsetInBits If this is one fragment out of a + /// fragmented /// location, this is the offset of the /// fragment inside the entire variable. /// \return false if no DWARF register exists /// for MachineReg. - bool AddMachineRegExpression(const TargetRegisterInfo &TRI, + bool addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &Expr, unsigned MachineReg, unsigned FragmentOffsetInBits = 0); /// Emit all remaining operations in the DIExpressionCursor. @@ -189,7 +218,7 @@ public: /// \param FragmentOffsetInBits If this is one fragment out of multiple /// locations, this is the offset of the /// fragment inside the entire variable. - void AddExpression(DIExpressionCursor &&Expr, + void addExpression(DIExpressionCursor &&Expr, unsigned FragmentOffsetInBits = 0); /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to @@ -198,33 +227,32 @@ public: }; /// DwarfExpression implementation for .debug_loc entries. -class DebugLocDwarfExpression : public DwarfExpression { +class DebugLocDwarfExpression final : public DwarfExpression { ByteStreamer &BS; + void emitOp(uint8_t Op, const char *Comment = nullptr) override; + void emitSigned(int64_t Value) override; + void emitUnsigned(uint64_t Value) override; + bool isFrameRegister(const TargetRegisterInfo &TRI, + unsigned MachineReg) override; public: DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS) : DwarfExpression(DwarfVersion), BS(BS) {} - - void EmitOp(uint8_t Op, const char *Comment = nullptr) override; - void EmitSigned(int64_t Value) override; - void EmitUnsigned(uint64_t Value) override; - bool isFrameRegister(const TargetRegisterInfo &TRI, - unsigned MachineReg) override; }; /// DwarfExpression implementation for singular DW_AT_location. 
-class DIEDwarfExpression : public DwarfExpression { +class DIEDwarfExpression final : public DwarfExpression { const AsmPrinter &AP; DwarfUnit &DU; DIELoc &DIE; -public: - DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); - void EmitOp(uint8_t Op, const char *Comment = nullptr) override; - void EmitSigned(int64_t Value) override; - void EmitUnsigned(uint64_t Value) override; + void emitOp(uint8_t Op, const char *Comment = nullptr) override; + void emitSigned(int64_t Value) override; + void emitUnsigned(uint64_t Value) override; bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) override; +public: + DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); DIELoc *finalize() { DwarfExpression::finalize(); return &DIE; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index d4d2ed2..54924e9 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -53,6 +53,7 @@ class DwarfFile { // Collection of abstract subprogram DIEs. DenseMap<const MDNode *, DIE *> AbstractSPDies; + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables; /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can /// be shared across CUs, that is why we keep the map here instead @@ -105,6 +106,9 @@ public: DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { return AbstractSPDies; } + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() { + return AbstractVariables; + } void insertDIE(const MDNode *TypeMD, DIE *Die) { DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 2a866c0..4f4ebfc 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -18,18 +18,19 @@ #include "DwarfExpression.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Metadata.h" -#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MachineLocation.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -54,15 +55,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, : DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU), DIE(DIE) {} -void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) { +void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) { DU.addUInt(DIE, dwarf::DW_FORM_data1, Op); } -void DIEDwarfExpression::EmitSigned(int64_t Value) { +void DIEDwarfExpression::emitSigned(int64_t Value) { DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value); } -void DIEDwarfExpression::EmitUnsigned(uint64_t Value) { +void DIEDwarfExpression::emitUnsigned(uint64_t Value) { DU.addUInt(DIE, dwarf::DW_FORM_udata, Value); } @@ -73,8 +74,8 @@ bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI, DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug 
*DW, DwarfFile *DWU) - : DIEUnit(A->getDwarfVersion(), A->getPointerSize(), UnitTag), CUNode(Node), - Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) { + : DIEUnit(A->getDwarfVersion(), A->MAI->getCodePointerSize(), UnitTag), + CUNode(Node), Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) { } DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A, @@ -98,25 +99,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const { default: break; - case dwarf::DW_LANG_C89: - case dwarf::DW_LANG_C99: + // The languages below have valid values in all DWARF versions. case dwarf::DW_LANG_C: + case dwarf::DW_LANG_C89: case dwarf::DW_LANG_C_plus_plus: - case dwarf::DW_LANG_ObjC: - case dwarf::DW_LANG_ObjC_plus_plus: return 0; case dwarf::DW_LANG_Fortran77: case dwarf::DW_LANG_Fortran90: - case dwarf::DW_LANG_Fortran95: return 1; - // The languages below have valid values only if the DWARF version >= 4. + // The languages below have valid values only if the DWARF version >= 3. + case dwarf::DW_LANG_C99: + case dwarf::DW_LANG_ObjC: + case dwarf::DW_LANG_ObjC_plus_plus: + if (DD->getDwarfVersion() >= 3) + return 0; + break; + + case dwarf::DW_LANG_Fortran95: + if (DD->getDwarfVersion() >= 3) + return 1; + break; + + // Starting with DWARF v4, all defined languages have valid values. + case dwarf::DW_LANG_D: case dwarf::DW_LANG_Java: case dwarf::DW_LANG_Python: case dwarf::DW_LANG_UPC: - case dwarf::DW_LANG_D: - if (dwarf::DWARF_VERSION >= 4) + if (DD->getDwarfVersion() >= 4) return 0; break; @@ -127,31 +138,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const { case dwarf::DW_LANG_Modula2: case dwarf::DW_LANG_Pascal83: case dwarf::DW_LANG_PLI: - if (dwarf::DWARF_VERSION >= 4) + if (DD->getDwarfVersion() >= 4) return 1; break; - // The languages below have valid values only if the DWARF version >= 5. - case dwarf::DW_LANG_OpenCL: - case dwarf::DW_LANG_Go: - case dwarf::DW_LANG_Haskell: + // The languages below are new in DWARF v5. + case dwarf::DW_LANG_BLISS: + case dwarf::DW_LANG_C11: case dwarf::DW_LANG_C_plus_plus_03: case dwarf::DW_LANG_C_plus_plus_11: + case dwarf::DW_LANG_C_plus_plus_14: + case dwarf::DW_LANG_Dylan: + case dwarf::DW_LANG_Go: + case dwarf::DW_LANG_Haskell: case dwarf::DW_LANG_OCaml: + case dwarf::DW_LANG_OpenCL: + case dwarf::DW_LANG_RenderScript: case dwarf::DW_LANG_Rust: - case dwarf::DW_LANG_C11: case dwarf::DW_LANG_Swift: - case dwarf::DW_LANG_Dylan: - case dwarf::DW_LANG_C_plus_plus_14: - if (dwarf::DWARF_VERSION >= 5) + if (DD->getDwarfVersion() >= 5) return 0; break; - case dwarf::DW_LANG_Modula3: - case dwarf::DW_LANG_Julia: case dwarf::DW_LANG_Fortran03: case dwarf::DW_LANG_Fortran08: - if (dwarf::DWARF_VERSION >= 5) + case dwarf::DW_LANG_Julia: + case dwarf::DW_LANG_Modula3: + if (DD->getDwarfVersion() >= 5) return 1; break; } @@ -160,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const { } /// Check whether the DIE for this MDNode can be shared across CUs. -static bool isShareableAcrossCUs(const DINode *D) { +bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const { // When the MDNode can be part of the type system, the DIE can be shared // across CUs. // Combining type units and cross-CU DIE sharing is lower value (since @@ -168,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) { // level already) but may be implementable for some value in projects // building multiple independent libraries with LTO and then linking those // together. 
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return false; return (isa<DIType>(D) || (isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) && !GenerateDwarfTypeUnits; @@ -285,13 +300,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) { dwarf::DW_FORM_ref_sig8, DIEInteger(Signature)); } -void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, - StringRef Identifier) { - uint64_t Signature = DD->makeTypeSignature(Identifier); - Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8, - DIEInteger(Signature)); -} - void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry) { const DIEUnit *CU = Die.getUnit(); @@ -369,10 +377,6 @@ void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) { addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory()); } -void DwarfUnit::addSourceLine(DIE &Die, const DINamespace *NS) { - addSourceLine(Die, NS->getLine(), NS->getFilename(), NS->getDirectory()); -} - /* Byref variables, in Blocks, are declared by the programmer as "SomeType VarName;", but the compiler creates a __Block_byref_x_VarName struct, and gives the variable VarName either the struct, or a pointer to the struct, as @@ -465,50 +469,48 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // Decode the original location, and use that as the start of the byref // variable's location. DIELoc *Loc = new (DIEValueAllocator) DIELoc; - SmallVector<uint64_t, 6> DIExpr; - DIEDwarfExpression Expr(*Asm, *this, *Loc); - - bool validReg; - if (Location.isReg()) - validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg()); - else - validReg = - Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg(), Location.getOffset()); - - if (!validReg) - return; - + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); + + SmallVector<uint64_t, 9> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(Location.getOffset()); + } // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). if (isPointer) - DIExpr.push_back(dwarf::DW_OP_deref); + Ops.push_back(dwarf::DW_OP_deref); // Next add the offset for the '__forwarding' field: // DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in // adding the offset if it's 0. if (forwardingFieldOffset > 0) { - DIExpr.push_back(dwarf::DW_OP_plus); - DIExpr.push_back(forwardingFieldOffset); + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(forwardingFieldOffset); } // Now dereference the __forwarding field to get to the real __Block_byref // struct: DW_OP_deref. - DIExpr.push_back(dwarf::DW_OP_deref); + Ops.push_back(dwarf::DW_OP_deref); // Now that we've got the real __Block_byref... struct, add the offset // for the variable's field to get to the location of the actual variable: // DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0. 
if (varFieldOffset > 0) { - DIExpr.push_back(dwarf::DW_OP_plus); - DIExpr.push_back(varFieldOffset); + Ops.push_back(dwarf::DW_OP_plus_uconst); + Ops.push_back(varFieldOffset); } - Expr.AddExpression(makeArrayRef(DIExpr)); - Expr.finalize(); + + DIExpressionCursor Cursor(Ops); + const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + return; + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Loc); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Return true if type encoding is unsigned. @@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) { addString(Die, DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name : dwarf::DW_AT_MIPS_linkage_name, - GlobalValue::getRealLinkageName(LinkageName)); + GlobalValue::dropLLVMManglingEscape(LinkageName)); } void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) { @@ -658,6 +660,14 @@ void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) { } } +/// Add thrown types. +void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) { + for (const auto *Ty : ThrownTypes) { + DIE &TT = createAndAddDIE(dwarf::DW_TAG_thrown_type, Die); + addType(TT, cast<DIType>(Ty)); + } +} + DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { if (!Context || isa<DIFile>(Context)) return &getUnitDie(); @@ -672,7 +682,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { return getDIE(Context); } -DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { +DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) { auto *Context = resolve(Ty->getScope()); DIE *ContextDIE = getOrCreateContextDIE(Context); @@ -684,8 +694,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { constructTypeDIE(TyDIE, cast<DICompositeType>(Ty)); - if (!Ty->isExternalTypeRef()) - updateAcceleratorTables(Context, Ty, TyDIE); + updateAcceleratorTables(Context, Ty, TyDIE); return &TyDIE; } @@ -841,6 +850,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // Add source line info if available and TyDesc is not a forward declaration. if (!DTy->isForwardDecl()) addSourceLine(Buffer, DTy); + + // If DWARF address space value is other than None, add it for pointer and + // reference types as DW_AT_address_class. + if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type || + Tag == dwarf::DW_TAG_reference_type)) + addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4, + DTy->getDWARFAddressSpace().getValue()); } void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { @@ -892,13 +908,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { - if (CTy->isExternalTypeRef()) { - StringRef Identifier = CTy->getIdentifier(); - assert(!Identifier.empty() && "external type ref without identifier"); - addFlag(Buffer, dwarf::DW_AT_declaration); - return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier); - } - // Add name if not anonymous or intermediate type. 
StringRef Name = CTy->getName(); @@ -1074,7 +1083,6 @@ DIE *DwarfUnit::getOrCreateNameSpace(const DINamespace *NS) { Name = "(anonymous namespace)"; DD->addAccelNamespace(Name, NDie); addGlobalName(Name, NDie, NS->getScope()); - addSourceLine(NDie, NS); if (NS->getExportSymbols()) addFlag(NDie, dwarf::DW_AT_export_symbols); return &NDie; @@ -1180,8 +1188,12 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, } void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, - bool Minimal) { - if (!Minimal) + bool SkipSPAttributes) { + // If -fdebug-info-for-profiling is enabled, need to emit the subprogram + // and its source location. + bool SkipSPSourceLocation = SkipSPAttributes && + !CUNode->getDebugInfoForProfiling(); + if (!SkipSPSourceLocation) if (applySubprogramDefinitionAttributes(SP, SPDie)) return; @@ -1189,12 +1201,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, if (!SP->getName().empty()) addString(SPDie, dwarf::DW_AT_name, SP->getName()); + if (!SkipSPSourceLocation) + addSourceLine(SPDie, SP); + // Skip the rest of the attributes under -gmlt to save space. - if (Minimal) + if (SkipSPAttributes) return; - addSourceLine(SPDie, SP); - // Add the prototype if we have a prototype and we have a C like // language. uint16_t Language = getLanguage(); @@ -1241,6 +1254,8 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, constructSubprogramArguments(SPDie, Args); } + addThrownTypes(SPDie, SP->getThrownTypes()); + if (SP->isArtificial()) addFlag(SPDie, dwarf::DW_AT_artificial); @@ -1526,18 +1541,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { return &StaticMemberDIE; } -void DwarfUnit::emitHeader(bool UseOffsets) { +void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) { // Emit size of content not including length itself Asm->OutStreamer->AddComment("Length of Unit"); Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize()); Asm->OutStreamer->AddComment("DWARF version number"); - Asm->EmitInt16(DD->getDwarfVersion()); - Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); + unsigned Version = DD->getDwarfVersion(); + Asm->EmitInt16(Version); + + // DWARF v5 reorders the address size and adds a unit type. + if (Version >= 5) { + Asm->OutStreamer->AddComment("DWARF Unit Type"); + Asm->EmitInt8(UT); + Asm->OutStreamer->AddComment("Address Size (in bytes)"); + Asm->EmitInt8(Asm->MAI->getCodePointerSize()); + } // We share one abbreviations table across all units so it's always at the // start of the section. Use a relocatable offset where needed to ensure // linking doesn't invalidate that offset. + Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); if (UseOffsets) Asm->EmitInt32(0); @@ -1545,12 +1569,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) { Asm->emitDwarfSymbolReference( TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false); - Asm->OutStreamer->AddComment("Address Size (in bytes)"); - Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); + if (Version <= 4) { + Asm->OutStreamer->AddComment("Address Size (in bytes)"); + Asm->EmitInt8(Asm->MAI->getCodePointerSize()); + } } void DwarfTypeUnit::emitHeader(bool UseOffsets) { - DwarfUnit::emitHeader(UseOffsets); + DwarfUnit::emitCommonHeader(UseOffsets, + DD->useSplitDwarf() ? 
dwarf::DW_UT_split_type + : dwarf::DW_UT_type); Asm->OutStreamer->AddComment("Type Signature"); Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature)); Asm->OutStreamer->AddComment("Type DIE Offset"); @@ -1559,8 +1587,46 @@ void DwarfTypeUnit::emitHeader(bool UseOffsets) { sizeof(Ty->getOffset())); } +DIE::value_iterator +DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo) { + return Die.addValue(DIEValueAllocator, Attribute, + DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + new (DIEValueAllocator) DIEDelta(Hi, Lo)); +} + +DIE::value_iterator +DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, const MCSymbol *Sec) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + return addLabel(Die, Attribute, + DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + Label); + return addSectionDelta(Die, Attribute, Label, Sec); +} + bool DwarfTypeUnit::isDwoUnit() const { // Since there are no skeleton type units, all type units are dwo type units // when split DWARF is being used. return DD->useSplitDwarf(); } + +void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) { + getCU().addGlobalNameForTypeUnit(Name, Context); +} + +void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die, + const DIScope *Context) { + getCU().addGlobalTypeUnitType(Ty, Context); +} + +const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const { + if (!Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + return nullptr; + if (isDwoUnit()) + return nullptr; + return getSection()->getBeginSymbol(); +} diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8654d6f..4cc01b3 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -65,7 +65,7 @@ public: //===----------------------------------------------------------------------===// /// This dwarf writer support class manages information associated with a /// source file. - class DwarfUnit : public DIEUnit { +class DwarfUnit : public DIEUnit { protected: /// MDNode for the compile unit. const DICompileUnit *CUNode; @@ -103,9 +103,10 @@ protected: bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); -public: - virtual ~DwarfUnit(); + bool shareAcrossDWOCUs() const; + bool isShareableAcrossCUs(const DINode *D) const; +public: // Accessors. AsmPrinter* getAsmPrinter() const { return Asm; } uint16_t getLanguage() const { return CUNode->getSourceLanguage(); } @@ -124,12 +125,12 @@ public: std::string getParentContextString(const DIScope *Context) const; /// Add a new global name to the compile unit. - virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) { - } + virtual void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) = 0; /// Add a new global type to the compile unit. virtual void addGlobalType(const DIType *Ty, const DIE &Die, - const DIScope *Context) {} + const DIScope *Context) = 0; /// Returns the DIE map slot for the specified debug variable. /// @@ -198,9 +199,6 @@ public: /// Add a type's DW_AT_signature and set the declaration flag. void addDIETypeSignature(DIE &Die, uint64_t Signature); - /// Add an attribute containing the type signature for a unique identifier. 
- void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, - StringRef Identifier); /// Add block data. void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); @@ -215,7 +213,6 @@ public: void addSourceLine(DIE &Die, const DIGlobalVariable *G); void addSourceLine(DIE &Die, const DISubprogram *SP); void addSourceLine(DIE &Die, const DIType *Ty); - void addSourceLine(DIE &Die, const DINamespace *NS); void addSourceLine(DIE &Die, const DIObjCProperty *Ty); /// Add constant value entry in variable DIE. @@ -235,6 +232,9 @@ public: /// Add template parameters in buffer. void addTemplateParams(DIE &Buffer, DINodeArray TParams); + /// Add thrown types. + void addThrownTypes(DIE &Die, DINodeArray ThrownTypes); + // FIXME: Should be reformulated in terms of addComplexAddress. /// Start with the address based on the location provided, and generate the /// DWARF information necessary to find the actual Block variable (navigating @@ -256,15 +256,12 @@ public: DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false); void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, - bool Minimal = false); + bool SkipSPAttributes = false); /// Find existing DIE or create new DIE for the given type. DIE *getOrCreateTypeDIE(const MDNode *N); /// Get context owner's DIE. - DIE *createTypeDIE(const DICompositeType *Ty); - - /// Get context owner's DIE. DIE *getOrCreateContextDIE(const DIScope *Context); /// Construct DIEs for types that contain vtables. @@ -282,17 +279,30 @@ public: virtual unsigned getHeaderSize() const { return sizeof(int16_t) + // DWARF version number sizeof(int32_t) + // Offset Into Abbrev. Section - sizeof(int8_t); // Pointer Size (in bytes) + sizeof(int8_t) + // Pointer Size (in bytes) + (DD->getDwarfVersion() >= 5 ? sizeof(int8_t) + : 0); // DWARF v5 unit type } /// Emit the header for this unit, not including the initial length field. - virtual void emitHeader(bool UseOffsets); + virtual void emitHeader(bool UseOffsets) = 0; virtual DwarfCompileUnit &getCU() = 0; void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); + /// addSectionDelta - Add a label delta attribute data and value. + DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo); + + /// Add a Dwarf section label attribute data and value. + DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, + const MCSymbol *Sec); + protected: + ~DwarfUnit(); + /// Create new static data member DIE. DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT); @@ -306,6 +316,14 @@ protected: return Ref.resolve(); } + /// If this is a named finished type then include it in the list of types for + /// the accelerator tables. + void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, + const DIE &TyDIE); + + /// Emit the common part of the header for this unit. + void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT); + private: void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy); void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy); @@ -330,15 +348,11 @@ private: /// Set D as anonymous type for index which can be reused later. void setIndexTyDie(DIE *D) { IndexTyDie = D; } - /// If this is a named finished type then include it in the list of types for - /// the accelerator tables. 
- void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, - const DIE &TyDIE); - virtual bool isDwoUnit() const = 0; + const MCSymbol *getCrossSectionRelativeBaseAddress() const override; }; -class DwarfTypeUnit : public DwarfUnit { +class DwarfTypeUnit final : public DwarfUnit { uint64_t TypeSignature; const DIE *Ty; DwarfCompileUnit &CU; @@ -354,12 +368,19 @@ public: void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; } void setType(const DIE *Ty) { this->Ty = Ty; } + /// Get context owner's DIE. + DIE *createTypeDIE(const DICompositeType *Ty); + /// Emit the header for this unit, not including the initial length field. void emitHeader(bool UseOffsets) override; unsigned getHeaderSize() const override { return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature sizeof(uint32_t); // Type DIE Offset } + void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) override; + void addGlobalType(const DIType *Ty, const DIE &Die, + const DIScope *Context) override; DwarfCompileUnit &getCU() override { return CU; } }; } // end llvm namespace diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 0a4a7a0..e14d5be 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -309,7 +309,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites, // If some instruction between the previous try-range and the end of the // function may throw, create a call-site entry with no landing pad for the // region following the try-range. - if (SawPotentiallyThrowing && !IsSJLJ && LastLabel != nullptr) { + if (SawPotentiallyThrowing && !IsSJLJ) { CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 }; CallSites.push_back(Site); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 6a023b9..c579555 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -1,4 +1,4 @@ -//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===// +//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===// // // The LLVM Compiler Infrastructure // @@ -13,22 +13,20 @@ // //===----------------------------------------------------------------------===// +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCs.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Metadata.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter { public: void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override; }; -} + +} // end anonymous namespace static GCMetadataPrinterRegistry::Add<ErlangGCPrinter> X("erlang", "erlang-compatible garbage collector"); -void llvm::linkErlangGCPrinter() {} - void 
ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { MCStreamer &OS = *AP.OutStreamer; @@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, } } } + +void llvm::linkErlangGCPrinter() {} diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 8baee4d..035f1a0 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -1,4 +1,4 @@ -//===-- OcamlGCPrinter.cpp - Ocaml frametable emitter ---------------------===// +//===- OcamlGCPrinter.cpp - Ocaml frametable emitter ----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,23 +11,27 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/GCs.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCs.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <cctype> +#include <cstddef> +#include <cstdint> +#include <string> + using namespace llvm; namespace { @@ -37,7 +41,8 @@ public: void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override; void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override; }; -} + +} // end anonymous namespace static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter> Y("ocaml", "ocaml 3.10-compatible collector"); @@ -50,7 +55,7 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) { std::string SymName; SymName += "caml"; size_t Letter = SymName.size(); - SymName.append(MId.begin(), find(MId, '.')); + SymName.append(MId.begin(), llvm::find(MId, '.')); SymName += "__"; SymName += Id; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index 9d7c96a..5d485f2 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -14,6 +14,8 @@ #include "WinException.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -29,8 +31,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCWin64EH.h" -#include "llvm/Support/COFF.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetFrameLowering.h" @@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) { const Function *F = MF->getFunction(); - shouldEmitMoves = Asm->needsSEHMoves(); + shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI(); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); unsigned PerEncoding = 
TLOF.getPersonalityEncoding(); @@ -94,14 +94,14 @@ void WinException::beginFunction(const MachineFunction *MF) { // If we're not using CFI, we don't want the CFI or the personality, but we // might want EH tables if we had EH pads. - if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) { + if (!Asm->MAI->usesWindowsCFI()) { if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) { // If this is 32-bit SEH and we don't have any funclets (really invokes), // make sure we emit the parent offset label. Some unreferenced filter // functions may still refer to it. const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName); } shouldEmitLSDA = hasEHFunclets; @@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm, // their funclet entry block's number. const MachineFunction *MF = MBB->getParent(); const Function *F = MF->getFunction(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCContext &Ctx = MF->getContext(); StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch"; return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" + @@ -252,7 +252,7 @@ void WinException::endFunclet() { !CurrentFuncletEntry->isCleanupFuncletEntry()) { // If this is a C++ catch funclet (or the parent function), // emit a reference to the LSDA for the parent function. - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol( Twine("$cppxdata$", FuncLinkageName)); Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4); @@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) { // Emit a label assignment with the SEH frame offset so we can use it for // llvm.x86.seh.recoverfp. 
StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); MCSymbol *ParentFrameOffset = Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); const MCExpr *MCOffset = @@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { auto &OS = *Asm->OutStreamer; const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable; MCSymbol *FuncInfoXData = nullptr; @@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo, void WinException::emitExceptHandlerTable(const MachineFunction *MF) { MCStreamer &OS = *Asm->OutStreamer; const Function *F = MF->getFunction(); - StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); bool VerboseAsm = OS.isVerboseAsm(); auto AddComment = [&](const Twine &Comment) { diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp index bf5cf10..aa9c8e9 100644 --- a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" @@ -35,20 +36,17 @@ using namespace llvm; namespace { class AtomicExpand: public FunctionPass { - const TargetMachine *TM; const TargetLowering *TLI; public: static char ID; // Pass identification, replacement for typeid - explicit AtomicExpand(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + AtomicExpand() : FunctionPass(ID), TLI(nullptr) { initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; private: - bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad); + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); bool tryExpandAtomicLoad(LoadInst *LI); @@ -98,12 +96,10 @@ namespace { char AtomicExpand::ID = 0; char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions", - false, false) +INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", + false, false) -FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) { - return new AtomicExpand(TM); -} +FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } namespace { // Helper functions to retrieve the size of atomic instructions. 
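The AtomicExpand hunks above and below replace constructor injection of the TargetMachine with a run-time query through TargetPassConfig. A minimal sketch of that idiom, outside this patch and using a made-up pass name, might look like the following; only the TargetPassConfig calls mirror what the diff actually does, everything else is illustrative:

#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

namespace {
// "SketchExpandPass" is a hypothetical name used only for illustration.
struct SketchExpandPass : public llvm::FunctionPass {
  static char ID;
  SketchExpandPass() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    // TargetPassConfig is only present when running inside a codegen
    // pipeline; bail out quietly when it is absent.
    auto *TPC = getAnalysisIfAvailable<llvm::TargetPassConfig>();
    if (!TPC)
      return false;
    auto &TM = TPC->getTM<llvm::TargetMachine>();
    // From here the subtarget and lowering info can be queried as before.
    (void)TM.getSubtargetImpl(F);
    return false;
  }
};
char SketchExpandPass::ID = 0;
} // end anonymous namespace

Looking the TargetMachine up this way lets the pass be default-constructed, which is what allows the INITIALIZE_TM_PASS registration above to become a plain INITIALIZE_PASS.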
@@ -173,9 +169,14 @@ bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { } // end anonymous namespace bool AtomicExpand::runOnFunction(Function &F) { - if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand()) + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + auto &TM = TPC->getTM<TargetMachine>(); + if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + TLI = TM.getSubtargetImpl(F)->getTargetLowering(); SmallVector<Instruction *, 1> AtomicInsts; @@ -224,22 +225,16 @@ bool AtomicExpand::runOnFunction(Function &F) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; - bool IsStore, IsLoad; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); - IsStore = false; - IsLoad = true; } else if (SI && isReleaseOrStronger(SI->getOrdering())) { FenceOrdering = SI->getOrdering(); SI->setOrdering(AtomicOrdering::Monotonic); - IsStore = true; - IsLoad = false; } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) || isAcquireOrStronger(RMWI->getOrdering()))) { FenceOrdering = RMWI->getOrdering(); RMWI->setOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) && (isReleaseOrStronger(CASI->getSuccessOrdering()) || isAcquireOrStronger(CASI->getSuccessOrdering()))) { @@ -250,11 +245,10 @@ bool AtomicExpand::runOnFunction(Function &F) { FenceOrdering = CASI->getSuccessOrdering(); CASI->setSuccessOrdering(AtomicOrdering::Monotonic); CASI->setFailureOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad); + MadeChange |= bracketInstWithFences(I, FenceOrdering); } } @@ -320,13 +314,12 @@ bool AtomicExpand::runOnFunction(Function &F) { return MadeChange; } -bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad) { +bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { IRBuilder<> Builder(I); - auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); - auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); // The trailing fence is emitted before the instruction instead of after // because there is no easy way of setting Builder insertion point after // an instruction. 
So we must erase it from the BB, and insert it back @@ -368,7 +361,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { auto *NewLI = Builder.CreateLoad(NewAddr); NewLI->setAlignment(LI->getAlignment()); NewLI->setVolatile(LI->isVolatile()); - NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope()); + NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n"); Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType()); @@ -451,7 +444,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); NewSI->setAlignment(SI->getAlignment()); NewSI->setVolatile(SI->isVolatile()); - NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope()); + NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); SI->eraseFromParent(); return NewSI; @@ -808,7 +801,7 @@ void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted); AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg( PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(), - CI->getFailureOrdering(), CI->getSynchScope()); + CI->getFailureOrdering(), CI->getSyncScopeID()); NewCI->setVolatile(CI->isVolatile()); // When we're building a strong cmpxchg, we need a loop, so you // might think we could use a weak cmpxchg inside. But, using strong @@ -931,7 +924,7 @@ AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst * auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal, CI->getSuccessOrdering(), CI->getFailureOrdering(), - CI->getSynchScope()); + CI->getSyncScopeID()); NewCI->setVolatile(CI->isVolatile()); NewCI->setWeak(CI->isWeak()); DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n"); @@ -1048,8 +1041,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(StartBB); // Start the main loop block now that we've taken care of the preliminaries. @@ -1064,8 +1056,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(ReleasingStoreBB); if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(TryStoreBB); Builder.SetInsertPoint(TryStoreBB); @@ -1094,8 +1085,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // necessary. 
Builder.SetInsertPoint(SuccessBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, SuccessOrder); Builder.CreateBr(ExitBB); Builder.SetInsertPoint(NoStoreBB); @@ -1107,8 +1097,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(FailureBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, FailureOrder); Builder.CreateBr(ExitBB); // Finally, we have control-flow based knowledge of whether the cmpxchg @@ -1532,7 +1521,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( Type *ResultTy; SmallVector<Value *, 6> Args; - AttributeSet Attr; + AttributeList Attr; // 'size' argument. if (!UseSizedLibcall) { @@ -1593,7 +1582,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( // Now, the return type. if (CASExpected) { ResultTy = Type::getInt1Ty(Ctx); - Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt); + Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt); } else if (HasResult && UseSizedLibcall) ResultTy = SizedIntTy; else diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp index a67e194..be93ff0 100644 --- a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -15,17 +15,15 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfoImpl.h" +#include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/CommandLine.h" #include <utility> using namespace llvm; -#define DEBUG_TYPE "basictti" - // This flag is used by the template base class for BasicTTIImpl, and here to // provide a definition. cl::opt<unsigned> diff --git a/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp b/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp new file mode 100644 index 0000000..2c41b59 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp @@ -0,0 +1,758 @@ +//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Coalesce basic blocks guarded by the same branch condition into a single +/// basic block. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "branch-coalescing" + +static cl::opt<cl::boolOrDefault> + EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden, + cl::desc("enable coalescing of duplicate branches")); + +STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); +STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); +STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); + +//===----------------------------------------------------------------------===// +// BranchCoalescing +//===----------------------------------------------------------------------===// +/// +/// Improve scheduling by coalescing branches that depend on the same condition. +/// This pass looks for blocks that are guarded by the same branch condition +/// and attempts to merge the blocks together. Such opportunities arise from +/// the expansion of select statements in the IR. +/// +/// For example, consider the following LLVM IR: +/// +/// %test = icmp eq i32 %x 0 +/// %tmp1 = select i1 %test, double %a, double 2.000000e-03 +/// %tmp2 = select i1 %test, double %b, double 5.000000e-03 +/// +/// This IR expands to the following machine code on PowerPC: +/// +/// BB#0: derived from LLVM BB %entry +/// Live Ins: %F1 %F3 %X6 +/// <SNIP1> +/// %vreg0<def> = COPY %F1; F8RC:%vreg0 +/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4 +/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>; +/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7 +/// BCC 76, %vreg5, <BB#2>; CRRC:%vreg5 +/// Successors according to CFG: BB#1(?%) BB#2(?%) +/// +/// BB#1: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 +/// Successors according to CFG: BB#2(?%) +/// +/// BB#2: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 BB#1 +/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>; +/// F8RC:%vreg9,%vreg8,%vreg0 +/// <SNIP2> +/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5 +/// Successors according to CFG: BB#3(?%) BB#4(?%) +/// +/// BB#3: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#2 +/// Successors according to CFG: BB#4(?%) +/// +/// BB#4: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#2 BB#3 +/// %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>; +/// F8RC:%vreg13,%vreg12,%vreg2 +/// <SNIP3> +/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use> +/// +/// When this pattern is detected, branch coalescing will try to collapse +/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3. 
+/// +/// If all conditions are met, IR should collapse to: +/// +/// BB#0: derived from LLVM BB %entry +/// Live Ins: %F1 %F3 %X6 +/// <SNIP1> +/// %vreg0<def> = COPY %F1; F8RC:%vreg0 +/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4 +/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>; +/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7 +/// <SNIP2> +/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5 +/// Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%) +/// BB#4(0x55555554 / 0x80000000 = 66.67%) +/// +/// BB#1: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 +/// Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%) +/// +/// BB#4: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 BB#1 +/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>; +/// F8RC:%vreg9,%vreg8,%vreg0 +/// %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>; +/// F8RC:%vreg13,%vreg12,%vreg2 +/// <SNIP3> +/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use> +/// +/// Branch Coalescing does not split blocks; it moves everything in the same +/// direction, ensuring it does not break use/definition semantics. +/// +/// PHI nodes and their corresponding use instructions are moved to the successor +/// block if there are no uses within the successor block's PHI nodes. PHI +/// node ordering cannot be assumed. +/// +/// Non-PHI instructions can be moved up to the predecessor basic block or down +/// to the successor basic block following any PHI instructions. Whether an +/// instruction moves up or down depends on whether the register(s) it defines +/// are used in the current block or in any PHI instructions at the beginning of +/// the successor block. + +namespace { + +class BranchCoalescing : public MachineFunctionPass { + struct CoalescingCandidateInfo { + MachineBasicBlock *BranchBlock; // Block containing the branch + MachineBasicBlock *BranchTargetBlock; // Block branched to + MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken + SmallVector<MachineOperand, 4> Cond; + bool MustMoveDown; + bool MustMoveUp; + + CoalescingCandidateInfo(); + void clear(); + }; + + MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + void initialize(MachineFunction &F); + bool canCoalesceBranch(CoalescingCandidateInfo &Cand); + bool identicalOperands(ArrayRef<MachineOperand> OperandList1, + ArrayRef<MachineOperand> OperandList2) const; + bool validateCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + + static bool isBranchCoalescingEnabled() { + return EnableBranchCoalescing == cl::BOU_TRUE; + } + +public: + static char ID; + + BranchCoalescing() : MachineFunctionPass(ID) { + initializeBranchCoalescingPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Branch Coalescing"; } + + bool mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion); + bool canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + void moveAndUpdatePHIs(MachineBasicBlock
*SourceRegionMBB, + MachineBasicBlock *TargetRegionMBB); + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char BranchCoalescing::ID = 0; +char &llvm::BranchCoalescingID = BranchCoalescing::ID; + +INITIALIZE_PASS_BEGIN(BranchCoalescing, DEBUG_TYPE, + "Branch Coalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing", + false, false) + +BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() + : BranchBlock(nullptr), BranchTargetBlock(nullptr), + FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {} + +void BranchCoalescing::CoalescingCandidateInfo::clear() { + BranchBlock = nullptr; + BranchTargetBlock = nullptr; + FallThroughBlock = nullptr; + Cond.clear(); + MustMoveDown = false; + MustMoveUp = false; +} + +void BranchCoalescing::initialize(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = &getAnalysis<MachinePostDominatorTree>(); + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); +} + +/// +/// Analyze the branch statement for the given candidate to determine if it +/// can be coalesced. If the branch can be coalesced, then the +/// BranchTargetBlock and the FallThroughBlock are recorded in the specified +/// Candidate. +/// +///\param[in,out] Cand The coalescing candidate to analyze +///\return true if and only if the branch can be coalesced, false otherwise +/// +bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) { + DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber() + << " can be coalesced:"); + MachineBasicBlock *FalseMBB = nullptr; + + if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB, + Cand.Cond)) { + DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n"); + return false; + } + + for (auto &I : Cand.BranchBlock->terminators()) { + DEBUG(dbgs() << "Looking at terminator : " << I << "\n"); + if (!I.isBranch()) + continue; + + if (I.getNumOperands() != I.getNumExplicitOperands()) { + DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I + << "\n"); + return false; + } + } + + if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) { + DEBUG(dbgs() << "EH Pad - skip\n"); + return false; + } + + // For now only consider triangles (i.e., BranchTargetBlock is set, + // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock) + if (!Cand.BranchTargetBlock || FalseMBB || + !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() << "Does not form a triangle - skip\n"); + return false; + } + + // Ensure there are only two successors + if (Cand.BranchBlock->succ_size() != 2) { + DEBUG(dbgs() << "Does not have 2 successors - skip\n"); + return false; + } + + // Sanity check - the block must be able to fall through + assert(Cand.BranchBlock->canFallThrough() && + "Expecting the block to fall through!"); + + // We have already ensured there are exactly two successors to + // BranchBlock and that BranchTargetBlock is a successor to BranchBlock. + // Ensure the single fall-through block is empty. + MachineBasicBlock *Succ = + (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock) + ?
*Cand.BranchBlock->succ_rbegin() + : *Cand.BranchBlock->succ_begin(); + + assert(Succ && "Expecting a valid fall-through block\n"); + + if (!Succ->empty()) { + DEBUG(dbgs() << "Fall-through block contains code -- skip\n"); + return false; + } + + if (!Succ->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() + << "Successor of fall through block is not branch taken block\n"); + return false; + } + + Cand.FallThroughBlock = Succ; + DEBUG(dbgs() << "Valid Candidate\n"); + return true; +} + +/// +/// Determine if the two operand lists are identical +/// +/// \param[in] OpList1 operand list +/// \param[in] OpList2 operand list +/// \return true if and only if the operands lists are identical +/// +bool BranchCoalescing::identicalOperands( + ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const { + + if (OpList1.size() != OpList2.size()) { + DEBUG(dbgs() << "Operand list is different size\n"); + return false; + } + + for (unsigned i = 0; i < OpList1.size(); ++i) { + const MachineOperand &Op1 = OpList1[i]; + const MachineOperand &Op2 = OpList2[i]; + + DEBUG(dbgs() << "Op1: " << Op1 << "\n" + << "Op2: " << Op2 << "\n"); + + if (Op1.isIdenticalTo(Op2)) { + DEBUG(dbgs() << "Op1 and Op2 are identical!\n"); + continue; + } + + // If the operands are not identical, but are registers, check to see if the + // definition of the register produces the same value. If they produce the + // same value, consider them to be identical. + if (Op1.isReg() && Op2.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && + TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { + MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); + MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); + if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { + DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def + << " produce the same value!\n"); + } else { + DEBUG(dbgs() << "Operands produce different values\n"); + return false; + } + } else { + DEBUG(dbgs() << "The operands are not provably identical.\n"); + return false; + } + } + return true; +} + +/// +/// Moves ALL PHI instructions in SourceMBB to beginning of TargetMBB +/// and update them to refer to the new block. PHI node ordering +/// cannot be assumed so it does not matter where the PHI instructions +/// are moved to in TargetMBB. +/// +/// \param[in] SourceMBB block to move PHI instructions from +/// \param[in] TargetMBB block to move PHI instructions to +/// +void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB, + MachineBasicBlock *TargetMBB) { + + MachineBasicBlock::iterator MI = SourceMBB->begin(); + MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI(); + + if (MI == ME) { + DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n"); + return; + } + + // Update all PHI instructions in SourceMBB and move to top of TargetMBB + for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) { + MachineInstr &PHIInst = *Iter; + for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) { + MachineOperand &MO = PHIInst.getOperand(i); + if (MO.getMBB() == SourceMBB) + MO.setMBB(TargetMBB); + } + } + TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME); +} + +/// +/// This function checks if MI can be moved to the beginning of the TargetMBB +/// following PHI instructions. A MI instruction can be moved to beginning of +/// the TargetMBB if there are no uses of it within the TargetMBB PHI nodes. +/// +/// \param[in] MI the machine instruction to move. 
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to beginning of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
+ const MachineBasicBlock &TargetMBB
+ ) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Def : MI.defs()) { // Looking at Def
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == &TargetMBB) {
+ DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+ return false;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the beginning.\n");
+ return true;
+}
+
+///
+/// This function checks if MI can be moved to the end of the TargetMBB,
+/// immediately before the first terminator. An MI instruction can be moved
+/// to the end of the TargetMBB if no PHI node defines what MI uses within
+/// its own MBB.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to end of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToEnd(const MachineInstr &MI,
+ const MachineBasicBlock &TargetMBB
+ ) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Use : MI.uses()) {
+ if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
+ MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
+ if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
+ DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+ return false;
+ } else {
+ DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the end.\n");
+ return true;
+}
+
+///
+/// This method checks to ensure the two coalescing candidates follow the
+/// expected pattern required for coalescing.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion; false otherwise.
+///
+bool BranchCoalescing::validateCandidates(
+ CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+
+ if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock)
+ llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion");
+ else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock))
+ llvm_unreachable("Expecting TargetRegion to dominate SourceRegion");
+ else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock))
+ llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion");
+ else if (!TargetRegion.FallThroughBlock->empty() ||
+ !SourceRegion.FallThroughBlock->empty())
+ llvm_unreachable("Expecting fall-through blocks to be empty");
+
+ return true;
+}
+
+///
+/// This method determines whether the two coalescing candidates can be merged.
+/// In order to be merged, all instructions must be able to
+/// 1. Move to the beginning of the SourceRegion.BranchTargetBlock;
+/// 2. Move to the end of the TargetRegion.BranchBlock.
+/// Merging involves moving the instructions in the
+/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock).
+///
+/// This function first tries to move instructions from the
+/// TargetRegion.BranchTargetBlock down, to the beginning of the
+/// SourceRegion.BranchTargetBlock. This is not possible if any register defined
+/// in TargetRegion.BranchTargetBlock is used in a PHI node in the
+/// SourceRegion.BranchTargetBlock. In this case, check whether the statement
+/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately
+/// before the branch statement). If it cannot move, then these blocks cannot
+/// be merged.
+///
+/// Note that there is no analysis for moving instructions past the fall-through
+/// blocks because they are confirmed to be empty. An assert is thrown if they
+/// are not.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion, false otherwise.
+///
+bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Walk through PHI nodes first and see if they force the merge into the
+ // SourceRegion.BranchTargetBlock.
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->instr_begin(),
+ E = SourceRegion.BranchBlock->getFirstNonPHI();
+ I != E; ++I) {
+ for (auto &Def : I->defs())
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
+ DEBUG(dbgs() << "PHI " << *I << " defines register used in another "
+ "PHI within branch target block -- can't merge\n");
+ NumPHINotMoved++;
+ return false;
+ }
+ if (Use.getParent() == SourceRegion.BranchBlock) {
+ DEBUG(dbgs() << "PHI " << *I
+ << " defines register used in this "
+ "block -- all must move down\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+ }
+
+ // Walk through the MIs to see if they should be merged into
+ // TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down)
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->getFirstNonPHI(),
+ E = SourceRegion.BranchBlock->end();
+ I != E; ++I) {
+ if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move down - must move up!\n");
+ SourceRegion.MustMoveUp = true;
+ }
+ if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move up - must move down!\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+
+ return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true;
+}
+
+/// Merge the instructions from SourceRegion.BranchBlock,
+/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into
+/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and
+/// TargetRegion.FallThroughBlock respectively.
+///
+/// The successors for blocks in TargetRegion will be updated to use the
+/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion
+/// will be removed from the function.
+///
+/// A region consists of a BranchBlock, a FallThroughBlock, and a
+/// BranchTargetBlock. Branch coalescing works on patterns where the
+/// TargetRegion's BranchTargetBlock must also be the SourceRegion's
+/// BranchBlock.
+/// +/// Before mergeCandidates: +/// +/// +---------------------------+ +/// | TargetRegion.BranchBlock | +/// +---------------------------+ +/// / | +/// / +--------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | TargetRegion.BranchTargetBlock | +/// | SourceRegion.BranchBlock | +/// +----------------------------------+ +/// / | +/// / +--------------------------------+ +/// | | SourceRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// After mergeCandidates: +/// +/// +-----------------------------+ +/// | TargetRegion.BranchBlock | +/// | SourceRegion.BranchBlock | +/// +-----------------------------+ +/// / | +/// / +---------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// | | SourceRegion.FallThroughBlock | +/// \ +---------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// \param[in] SourceRegion The candidate to move blocks from +/// \param[in] TargetRegion The candidate to move blocks to +/// +bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) { + + if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) { + llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!"); + return false; + } + + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Start the merging process by first handling the BranchBlock. + // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block + moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + + // Move remaining instructions in SourceRegion.BranchBlock into + // TargetRegion.BranchBlock + MachineBasicBlock::iterator firstInstr = + SourceRegion.BranchBlock->getFirstNonPHI(); + MachineBasicBlock::iterator lastInstr = + SourceRegion.BranchBlock->getFirstTerminator(); + + MachineBasicBlock *Source = SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock + : TargetRegion.BranchBlock; + + MachineBasicBlock::iterator Target = + SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock->getFirstNonPHI() + : TargetRegion.BranchBlock->getFirstTerminator(); + + Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr); + + // Once PHI and instructions have been moved we need to clean up the + // control flow. + + // Remove SourceRegion.FallThroughBlock before transferring successors of + // SourceRegion.BranchBlock to TargetRegion.BranchBlock. + SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock); + TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.BranchBlock); + // Update branch in TargetRegion.BranchBlock to jump to + // SourceRegion.BranchTargetBlock + // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock. 
+ TargetRegion.BranchBlock->ReplaceUsesOfBlockWith(
+ SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+ // Remove the branch statement(s) in SourceRegion.BranchBlock
+ MachineBasicBlock::iterator I =
+ SourceRegion.BranchBlock->terminators().begin();
+ while (I != SourceRegion.BranchBlock->terminators().end()) {
+ MachineInstr &CurrInst = *I;
+ ++I;
+ if (CurrInst.isBranch())
+ CurrInst.eraseFromParent();
+ }
+
+ // Fall-through block should be empty since this is part of the condition
+ // to coalesce the branches.
+ assert(TargetRegion.FallThroughBlock->empty() &&
+ "FallThroughBlocks should be empty!");
+
+ // Transfer successor information and move PHIs down to the
+ // branch-taken block.
+ TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.FallThroughBlock);
+ TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock);
+
+ // Remove the blocks from the function.
+ assert(SourceRegion.BranchBlock->empty() &&
+ "Expecting branch block to be empty!");
+ SourceRegion.BranchBlock->eraseFromParent();
+
+ assert(SourceRegion.FallThroughBlock->empty() &&
+ "Expecting fall-through block to be empty!\n");
+ SourceRegion.FallThroughBlock->eraseFromParent();
+
+ NumBlocksCoalesced++;
+ return true;
+}
+
+bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
+
+ if (skipFunction(*MF.getFunction()) || MF.empty() ||
+ !isBranchCoalescingEnabled())
+ return false;
+
+ bool didSomething = false;
+
+ DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+ initialize(MF);
+
+ DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+
+ CoalescingCandidateInfo Cand1, Cand2;
+ // Walk over blocks and find candidates to merge
+ // Continue trying to merge with the first candidate found, as long as merging
+ // is successful.
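// Editorial sketch (illustrative only; not part of this patch): the CFG shape
// the walk below looks for can arise when a target lowers two selects on the
// same condition with branches rather than conditional moves. Each select then
// becomes a triangle with an empty fall-through block and a PHI in the
// branch-target block, and the second triangle's BranchBlock is the first
// triangle's BranchTargetBlock, so the two branches on 'c' can be coalesced
// into one. The function below is hypothetical and only shows the source
// pattern; it is not code from this patch.
int selectPair(bool c, int x, int y, int z, int w) {
  int a = c ? x : y; // first triangle: empty fall-through, PHI in the join block
  int b = c ? z : w; // second triangle guarded by an identical condition
  return a + b;
}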
+ for (MachineBasicBlock &MBB : MF) { + bool MergedCandidates = false; + do { + MergedCandidates = false; + Cand1.clear(); + Cand2.clear(); + + Cand1.BranchBlock = &MBB; + + // If unable to coalesce the branch, then continue to next block + if (!canCoalesceBranch(Cand1)) + break; + + Cand2.BranchBlock = Cand1.BranchTargetBlock; + if (!canCoalesceBranch(Cand2)) + break; + + // Sanity check + // The branch-taken block of the second candidate should post-dominate the + // first candidate + assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) && + "Branch-taken block should post-dominate first candidate"); + + if (!identicalOperands(Cand1.Cond, Cand2.Cond)) { + DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and " + << Cand2.BranchBlock->getNumber() + << " have different branches\n"); + break; + } + if (!canMerge(Cand2, Cand1)) { + DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand2.BranchBlock->getNumber() << "\n"); + NumBlocksNotCoalesced++; + continue; + } + DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand1.BranchTargetBlock->getNumber() << "\n"); + MergedCandidates = mergeCandidates(Cand2, Cand1); + if (MergedCandidates) + didSomething = true; + + DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n"); + } while (MergedCandidates); + } + +#ifndef NDEBUG + // Verify MF is still valid after branch coalescing + if (didSomething) + MF.verify(nullptr, "Error in code produced by branch coalescing"); +#endif // NDEBUG + + DEBUG(dbgs() << "Finished Branch Coalescing\n"); + return didSomething; +} diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp index 6fba161..3c439e6 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp @@ -1,4 +1,4 @@ -//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===// +//===- BranchFolding.cpp - Fold machine code branch instructions ----------===// // // The LLVM Compiler Infrastructure // @@ -18,37 +18,55 @@ //===----------------------------------------------------------------------===// #include "BranchFolding.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" 
+#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <algorithm> +#include <cassert> +#include <cstddef> +#include <iterator> +#include <numeric> +#include <vector> + using namespace llvm; -#define DEBUG_TYPE "branchfolding" +#define DEBUG_TYPE "branch-folder" STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); STATISTIC(NumTailMerge , "Number of block tails merged"); STATISTIC(NumHoist , "Number of times common instructions are hoisted"); +STATISTIC(NumTailCalls, "Number of tail calls optimized"); static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge", cl::init(cl::BOU_UNSET), cl::Hidden); @@ -67,10 +85,12 @@ TailMergeSize("tail-merge-size", cl::init(3), cl::Hidden); namespace { + /// BranchFolderPass - Wrap branch folder in a machine function pass. class BranchFolderPass : public MachineFunctionPass { public: static char ID; + explicit BranchFolderPass(): MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -82,12 +102,13 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} + +} // end anonymous namespace char BranchFolderPass::ID = 0; char &llvm::BranchFolderPassID = BranchFolderPass::ID; -INITIALIZE_PASS(BranchFolderPass, "branch-folder", +INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE, "Control Flow Optimizer", false, false) bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { @@ -123,8 +144,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, } } -/// RemoveDeadBlock - Remove the specified dead machine basic block from the -/// function, updating the CFG. void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); @@ -144,9 +163,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { MLI->removeBlock(MBB); } -/// OptimizeFunction - Perhaps branch folding, tail merging and other -/// CFG optimizations on the given function. Block placement changes the layout -/// and may create new tail merging opportunities. bool BranchFolder::OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, @@ -156,13 +172,14 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, TriedMerging.clear(); + MachineRegisterInfo &MRI = MF.getRegInfo(); AfterBlockPlacement = AfterPlacement; TII = tii; TRI = tri; MMI = mmi; MLI = mli; + this->MRI = &MRI; - MachineRegisterInfo &MRI = MF.getRegInfo(); UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF); if (!UpdateLiveIns) MRI.invalidateLiveness(); @@ -348,23 +365,18 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, return TailLen; } -/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything -/// after it, replacing it with an unconditional branch to NewDest. void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest) { TII->ReplaceTailWithBranchTo(OldInst, NewDest); if (UpdateLiveIns) { NewDest->clearLiveIns(); - computeLiveIns(LiveRegs, *TRI, *NewDest); + computeLiveIns(LiveRegs, *MRI, *NewDest); } ++NumTailMerge; } -/// SplitMBBAt - Given a machine basic block and an iterator into it, split the -/// MBB so that the part before the iterator falls into the part starting at the -/// iterator. This returns the new MBB. 
MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB) { @@ -375,7 +387,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, // Create the fall-through block. MachineFunction::iterator MBBI = CurMBB.getIterator(); - MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(BB); + MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB); CurMBB.getParent()->insert(++MBBI, NewMBB); // Move all the successors of this block to the specified block. @@ -388,7 +400,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end()); // NewMBB belongs to the same loop as CurMBB. - if (MLI) + if (MLI) if (MachineLoop *ML = MLI->getLoopFor(&CurMBB)) ML->addBasicBlockToLoop(NewMBB, MLI->getBase()); @@ -396,7 +408,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB)); if (UpdateLiveIns) - computeLiveIns(LiveRegs, *TRI, *NewMBB); + computeLiveIns(LiveRegs, *MRI, *NewMBB); // Add the new block to the funclet. const auto &FuncletI = FuncletMembership.find(&CurMBB); @@ -436,7 +448,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB)); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - DebugLoc dl; // FIXME: this is nowhere + DebugLoc dl = CurMBB->findBranchDebugLoc(); if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { MachineBasicBlock *NextBB = &*I; if (TBB == NextBB && !Cond.empty() && !FBB) { @@ -497,6 +509,15 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS, return MBFI.printBlockFreq(OS, Freq); } +void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) { + MBFI.view(Name, isSimple); +} + +uint64_t +BranchFolder::MBFIWrapper::getEntryFreq() const { + return MBFI.getEntryFreq(); +} + /// CountTerminators - Count the number of terminators in the given /// block and set I to the position of the first non-terminator, if there /// is one, or MBB->end() otherwise. @@ -504,7 +525,7 @@ static unsigned CountTerminators(MachineBasicBlock *MBB, MachineBasicBlock::iterator &I) { I = MBB->end(); unsigned NumTerms = 0; - for (;;) { + while (true) { if (I == MBB->begin()) { I = MBB->end(); break; @@ -516,6 +537,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB, return NumTerms; } +/// A no successor, non-return block probably ends in unreachable and is cold. +/// Also consider a block that ends in an indirect branch to be a return block, +/// since many targets use plain indirect branches to return. +static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) { + if (!MBB->succ_empty()) + return false; + if (MBB->empty()) + return true; + return !(MBB->back().isReturn() || MBB->back().isIndirectBranch()); +} + /// ProfitableToMerge - Check if two machine basic blocks have a common tail /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction @@ -570,6 +602,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, return true; } + // If these are identical non-return blocks with no successors, merge them. + // Such blocks are typically cold calls to noreturn functions like abort, and + // are unlikely to become a fallthrough target after machine block placement. 
+ // Tail merging these blocks is unlikely to create additional unconditional + // branches, and will reduce the size of this cold code. + if (I1 == MBB1->begin() && I2 == MBB2->begin() && + blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2)) + return true; + // If one of the blocks can be completely merged and happens to be in // a position where the other could fall through into it, merge any number // of instructions, because it can be done without a branch. @@ -579,6 +620,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin()) return true; + // If both blocks are identical and end in a branch, merge them unless they + // both have a fallthrough predecessor and successor. + // We can only do this after block placement because it depends on whether + // there are fallthroughs, and we don't know until after layout. + if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) { + auto BothFallThrough = [](MachineBasicBlock *MBB) { + if (MBB->succ_size() != 0 && !MBB->canFallThrough()) + return false; + MachineFunction::iterator I(MBB); + MachineFunction *MF = MBB->getParent(); + return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough(); + }; + if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2)) + return true; + } + // If both blocks have an unconditional branch temporarily stripped out, // count that as an additional common instruction for the following // heuristics. This heuristic is only accurate for single-succ blocks, so to @@ -604,16 +661,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, (I1 == MBB1->begin() || I2 == MBB2->begin()); } -/// ComputeSameTails - Look through all the blocks in MergePotentials that have -/// hash CurHash (guaranteed to match the last element). Build the vector -/// SameTails of all those that have the (same) largest number of instructions -/// in common of any pair of these blocks. SameTails entries contain an -/// iterator into MergePotentials (from which the MachineBasicBlock can be -/// found) and a MachineBasicBlock::iterator into that MBB indicating the -/// instruction where the matching code sequence begins. -/// Order of elements in SameTails is the reverse of the order in which -/// those blocks appear in MergePotentials (where they are not necessarily -/// consecutive). unsigned BranchFolder::ComputeSameTails(unsigned CurHash, unsigned MinCommonTailLength, MachineBasicBlock *SuccBB, @@ -650,8 +697,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash, return maxCommonTailLength; } -/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from -/// MergePotentials, restoring branches at ends of blocks as appropriate. void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB) { @@ -671,8 +716,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, MergePotentials.erase(CurMPIter, MergePotentials.end()); } -/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist -/// only of the common tail. Create a block that does by splitting one. 
bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, MachineBasicBlock *SuccBB, unsigned maxCommonTailLength, @@ -723,6 +766,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } +void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) { + MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock(); + + std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size()); + for (unsigned int i = 0 ; i != SameTails.size() ; ++i) { + if (i != commonTailIndex) + NextCommonInsts[i] = SameTails[i].getTailStartPos(); + else { + assert(SameTails[i].getTailStartPos() == MBB->begin() && + "MBB is not a common tail only block"); + } + } + + for (auto &MI : *MBB) { + if (MI.isDebugValue()) + continue; + DebugLoc DL = MI.getDebugLoc(); + for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) { + if (i == commonTailIndex) + continue; + + auto &Pos = NextCommonInsts[i]; + assert(Pos != SameTails[i].getBlock()->end() && + "Reached BB end within common tail"); + while (Pos->isDebugValue()) { + ++Pos; + assert(Pos != SameTails[i].getBlock()->end() && + "Reached BB end within common tail"); + } + assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!"); + DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc()); + NextCommonInsts[i] = ++Pos; + } + MI.setDebugLoc(DL); + } +} + static void mergeOperations(MachineBasicBlock::iterator MBBIStartPos, MachineBasicBlock &MBBCommon) { @@ -875,10 +955,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, // Recompute common tail MBB's edge weights and block frequency. setCommonTailEdgeWeights(*MBB); - // Remove the original debug location from the common tail. - for (auto &MI : *MBB) - if (!MI.isDebugValue()) - MI.setDebugLoc(DebugLoc()); + // Merge debug locations across identical instructions for common tail. + MergeCommonTailDebugLocs(commonTailIndex); // MBB is common tail. Adjust all other BB's to jump to this one. // Traversal must be forwards so erases work. @@ -1043,7 +1121,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // Remove the unconditional branch at the end, if any. if (TBB && (Cond.empty() || FBB)) { - DebugLoc dl; // FIXME: this is nowhere + DebugLoc dl = PBB->findBranchDebugLoc(); TII->removeBranch(*PBB); if (!Cond.empty()) // reinsert conditional branch only, for now @@ -1193,8 +1271,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) { return DebugLoc(); } -/// OptimizeBlock - Analyze and optimize control flow related to the specified -/// block. This is never called on the entry block. bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool MadeChange = false; MachineFunction &MF = *MBB->getParent(); @@ -1386,6 +1462,43 @@ ReoptimizeBlock: } } + if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && + MF.getFunction()->optForSize()) { + // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch + // direction, thereby defeating careful block placement and regressing + // performance. Therefore, only consider this for optsize functions. 
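// Editorial sketch (illustrative only; not part of this patch): the case
// handled below is a conditional branch whose target block consists of nothing
// but an unconditional tail call, i.e. "Jcc foo; ... foo: jmp bar". Source of
// roughly the following shape can produce that layout under -Os on targets
// that support conditional tail calls; the names are hypothetical.
long slowPath(long x);
long fastOrSlow(long x) {
  if (x & 1)
    return slowPath(x); // may be emitted as a block containing only "jmp slowPath"
  return x >> 1;
}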
+ MachineInstr &TailCall = *MBB->getFirstNonDebugInstr(); + if (TII->isUnconditionalTailCall(TailCall)) { + MachineBasicBlock *Pred = *MBB->pred_begin(); + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector<MachineOperand, 4> PredCond; + bool PredAnalyzable = + !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); + + if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB && + PredTBB != PredFBB) { + // The predecessor has a conditional branch to this block which consists + // of only a tail call. Try to fold the tail call into the conditional + // branch. + if (TII->canMakeTailCallConditional(PredCond, TailCall)) { + // TODO: It would be nice if analyzeBranch() could provide a pointer + // to the branch instruction so replaceBranchWithTailCall() doesn't + // have to search for it. + TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); + ++NumTailCalls; + Pred->removeSuccessor(MBB); + MadeChange = true; + return MadeChange; + } + } + // If the predecessor is falling through to this block, we could reverse + // the branch condition and fold the tail call into that. However, after + // that we might have to re-arrange the CFG to fall through to the other + // block and there is a high risk of regressing code size rather than + // improving it. + } + } + // Analyze the branch in the current block. MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr; SmallVector<MachineOperand, 4> CurCond; @@ -1508,7 +1621,6 @@ ReoptimizeBlock: // block doesn't fall through into some other block, see if we can find a // place to move this block where a fall-through will happen. if (!PrevBB.canFallThrough()) { - // Now we know that there was no fall-through into this block, check to // see if it has a fall-through into its successor. bool CurFallsThru = MBB->canFallThrough(); @@ -1599,8 +1711,6 @@ ReoptimizeBlock: // Hoist Common Code //===----------------------------------------------------------------------===// -/// HoistCommonCode - Hoist common instruction sequences at the start of basic -/// blocks to their common predecessor. bool BranchFolder::HoistCommonCode(MachineFunction &MF) { bool MadeChange = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { @@ -1734,9 +1844,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, return PI; } -/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction -/// sequence at the start of the function, move the instructions before MBB -/// terminator if it's legal. bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; @@ -1763,8 +1870,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { return false; bool HasDups = false; - SmallVector<unsigned, 4> LocalDefs; - SmallSet<unsigned, 4> LocalDefsSet; + SmallVector<unsigned, 4> LocalDefs, LocalKills; + SmallSet<unsigned, 4> ActiveDefsSet, AllDefsSet; MachineBasicBlock::iterator TIB = TBB->begin(); MachineBasicBlock::iterator FIB = FBB->begin(); MachineBasicBlock::iterator TIE = TBB->end(); @@ -1818,7 +1925,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { IsSafe = false; break; } - } else if (!LocalDefsSet.count(Reg)) { + } else if (!ActiveDefsSet.count(Reg)) { if (Defs.count(Reg)) { // Use is defined by the instruction at the point of insertion. 
IsSafe = false; @@ -1838,18 +1945,22 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore)) break; - // Remove kills from LocalDefsSet, these registers had short live ranges. + // Remove kills from ActiveDefsSet, these registers had short live ranges. for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isUse() || !MO.isKill()) continue; unsigned Reg = MO.getReg(); - if (!Reg || !LocalDefsSet.count(Reg)) + if (!Reg) continue; + if (!AllDefsSet.count(Reg)) { + LocalKills.push_back(Reg); + continue; + } if (TargetRegisterInfo::isPhysicalRegister(Reg)) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - LocalDefsSet.erase(*AI); + ActiveDefsSet.erase(*AI); } else { - LocalDefsSet.erase(Reg); + ActiveDefsSet.erase(Reg); } } @@ -1861,7 +1972,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg)) continue; LocalDefs.push_back(Reg); - addRegAndItsAliases(Reg, TRI, LocalDefsSet); + addRegAndItsAliases(Reg, TRI, ActiveDefsSet); + addRegAndItsAliases(Reg, TRI, AllDefsSet); } HasDups = true; @@ -1876,17 +1988,22 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { FBB->erase(FBB->begin(), FIB); // Update livein's. - bool AddedLiveIns = false; + bool ChangedLiveIns = false; for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { unsigned Def = LocalDefs[i]; - if (LocalDefsSet.count(Def)) { + if (ActiveDefsSet.count(Def)) { TBB->addLiveIn(Def); FBB->addLiveIn(Def); - AddedLiveIns = true; + ChangedLiveIns = true; } } + for (unsigned K : LocalKills) { + TBB->removeLiveIn(K); + FBB->removeLiveIn(K); + ChangedLiveIns = true; + } - if (AddedLiveIns) { + if (ChangedLiveIns) { TBB->sortUniqueLiveIns(); FBB->sortUniqueLiveIns(); } diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h index fc48e48..9268113 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.h +++ b/contrib/llvm/lib/CodeGen/BranchFolding.h @@ -37,6 +37,9 @@ namespace llvm { // flag. Ignored for optsize. unsigned MinCommonTailLength = 0); + /// Perhaps branch folding, tail merging and other CFG optimizations on the + /// given function. Block placement changes the layout and may create new + /// tail merging opportunities. bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli = nullptr, @@ -105,6 +108,7 @@ namespace llvm { bool UpdateLiveIns; unsigned MinCommonTailLength; const TargetInstrInfo *TII; + const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; MachineLoopInfo *MLI; @@ -122,6 +126,8 @@ namespace llvm { const MachineBasicBlock *MBB) const; raw_ostream &printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const; + void view(const Twine &Name, bool isSimple = true); + uint64_t getEntryFreq() const; private: const MachineBlockFrequencyInfo &MBFI; @@ -137,26 +143,64 @@ namespace llvm { MachineBasicBlock* PredBB, unsigned MinCommonTailLength); void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB); + + /// Delete the instruction OldInst and everything after it, replacing it + /// with an unconditional branch to NewDest. 
void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest); + + /// Given a machine basic block and an iterator into it, split the MBB so + /// that the part before the iterator falls into the part starting at the + /// iterator. This returns the new MBB. MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB); + + /// Look through all the blocks in MergePotentials that have hash CurHash + /// (guaranteed to match the last element). Build the vector SameTails of + /// all those that have the (same) largest number of instructions in common + /// of any pair of these blocks. SameTails entries contain an iterator into + /// MergePotentials (from which the MachineBasicBlock can be found) and a + /// MachineBasicBlock::iterator into that MBB indicating the instruction + /// where the matching code sequence begins. Order of elements in SameTails + /// is the reverse of the order in which those blocks appear in + /// MergePotentials (where they are not necessarily consecutive). unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB); + + /// Remove all blocks with hash CurHash from MergePotentials, restoring + /// branches at ends of blocks as appropriate. void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB, MachineBasicBlock* PredBB); + + /// None of the blocks to be tail-merged consist only of the common tail. + /// Create a block that does by splitting one. bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, MachineBasicBlock *SuccBB, unsigned maxCommonTailLength, unsigned &commonTailIndex); + /// Create merged DebugLocs of identical instructions across SameTails and + /// assign it to the instruction in common tail. + void MergeCommonTailDebugLocs(unsigned commonTailIndex); + bool OptimizeBranches(MachineFunction &MF); + + /// Analyze and optimize control flow related to the specified block. This + /// is never called on the entry block. bool OptimizeBlock(MachineBasicBlock *MBB); + + /// Remove the specified dead machine basic block from the function, + /// updating the CFG. void RemoveDeadBlock(MachineBasicBlock *MBB); + /// Hoist common instruction sequences at the start of basic blocks to their + /// common predecessor. bool HoistCommonCode(MachineFunction &MF); + + /// If the successors of MBB has common instruction sequence at the start of + /// the function, move the instructions before MBB terminator if it's legal. 
bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB); }; } diff --git a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp index 8b27570..27ee12c 100644 --- a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -7,17 +7,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -126,14 +126,16 @@ void BranchRelaxation::verify() { #endif } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// print block size and offset information - debugging -void BranchRelaxation::dumpBBs() { +LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() { for (auto &MBB : *MF) { const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) << format("size=%#x\n", BBI.Size); } } +#endif /// scanFunction - Do the initial scan of the function, building up /// information about each block. @@ -257,7 +259,7 @@ MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI, // Need to fix live-in lists if we track liveness. if (TRI->trackLivenessAfterRegAlloc(*MF)) - computeLiveIns(LiveRegs, *TRI, *NewBB); + computeLiveIns(LiveRegs, MF->getRegInfo(), *NewBB); ++NumSplit; @@ -343,6 +345,10 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) { // Do it here since if there's no split, no update is needed. MBB->replaceSuccessor(FBB, &NewBB); NewBB.addSuccessor(FBB); + + // Need to fix live-in lists if we track liveness. + if (TRI->trackLivenessAfterRegAlloc(*MF)) + computeLiveIns(LiveRegs, MF->getRegInfo(), NewBB); } // We now have an appropriate fall-through block in place (either naturally or diff --git a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp index ff7c99d..abac555 100644 --- a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp +++ b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp @@ -1,4 +1,4 @@ -//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===// +//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/GCs.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/GCs.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -77,6 +79,7 @@ public: UsesMetadata = false; CustomRoots = false; } + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. const PointerType *PT = cast<PointerType>(Ty); @@ -110,6 +113,7 @@ public: UsesMetadata = false; CustomRoots = false; } + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. 
const PointerType *PT = cast<PointerType>(Ty); @@ -117,7 +121,8 @@ public: return (1 == PT->getAddressSpace()); } }; -} + +} // end anonymous namespace // Register all the above so that they can be found at runtime. Note that // these static initializers are important since the registration list is diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp index dc2d38a..c2ced19 100644 --- a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -7,13 +7,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp index 2e33f14..7cad4d0 100644 --- a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp @@ -30,8 +30,7 @@ using namespace llvm; CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) : CallingConv(CC), IsVarArg(isVarArg), MF(mf), - TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C), - CallOrPrologue(Unknown) { + TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) { // No stack is used. StackOffset = 0; MaxStackArgAlign = 1; diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp index 4cf9b13..b7fd45a 100644 --- a/contrib/llvm/lib/CodeGen/CodeGen.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" #include "llvm-c/Initialization.h" +#include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" using namespace llvm; @@ -21,6 +21,7 @@ using namespace llvm; /// initializeCodeGen - Initialize all passes linked into the CodeGen library. 
void llvm::initializeCodeGen(PassRegistry &Registry) { initializeAtomicExpandPass(Registry); + initializeBranchCoalescingPass(Registry); initializeBranchFolderPassPass(Registry); initializeBranchRelaxationPass(Registry); initializeCodeGenPreparePass(Registry); @@ -31,14 +32,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeEarlyIfConverterPass(Registry); initializeExpandISelPseudosPass(Registry); initializeExpandPostRAPass(Registry); + initializeFEntryInserterPass(Registry); initializeFinalizeMachineBundlesPass(Registry); initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); + initializeImplicitNullChecksPass(Registry); initializeInterleavedAccessPass(Registry); + initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); + initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); @@ -47,7 +52,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); initializeMachineCSEPass(Registry); - initializeImplicitNullChecksPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); initializeMachineDominatorTreePass(Registry); @@ -55,31 +59,35 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoPass(Registry); + initializeMachineOptimizationRemarkEmitterPassPass(Registry); + initializeMachineOutlinerPass(Registry); initializeMachinePipelinerPass(Registry); initializeMachinePostDominatorTreePass(Registry); + initializeMachineRegionInfoPassPass(Registry); initializeMachineSchedulerPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineVerifierPassPass(Registry); - initializeXRayInstrumentationPass(Registry); - initializePatchableFunctionPass(Registry); initializeOptimizePHIsPass(Registry); initializePEIPass(Registry); initializePHIEliminationPass(Registry); + initializePatchableFunctionPass(Registry); initializePeepholeOptimizerPass(Registry); initializePostMachineSchedulerPass(Registry); initializePostRAHazardRecognizerPass(Registry); initializePostRASchedulerPass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeProcessImplicitDefsPass(Registry); + initializeRABasicPass(Registry); + initializeRAFastPass(Registry); initializeRAGreedyPass(Registry); initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + initializeScalarizeMaskedMemIntrinPass(Registry); initializeShrinkWrapPass(Registry); initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); initializeStackMapLivenessPass(Registry); - initializeLiveDebugValuesPass(Registry); - initializeSafeStackPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); initializeTailDuplicatePassPass(Registry); @@ -91,6 +99,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeVirtRegMapPass(Registry); initializeVirtRegRewriterPass(Registry); initializeWinEHPreparePass(Registry); + initializeXRayInstrumentationPass(Registry); } void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git 
a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp index 934b470..dc02a00 100644 --- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -13,20 +13,23 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -53,8 +56,11 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -77,9 +83,14 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); -STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, + "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -93,7 +104,7 @@ static cl::opt<bool> DisableSelectToBranch( cl::desc("Disable select to branch conversion.")); static cl::opt<bool> AddrSinkUsingGEPs( - "addr-sink-using-gep", cl::Hidden, cl::init(false), + "addr-sink-using-gep", cl::Hidden, cl::init(true), cl::desc("Address sinking in CGP using GEPs.")); static cl::opt<bool> EnableAndCmpSinking( @@ -123,7 +134,7 @@ static cl::opt<bool> DisablePreheaderProtect( cl::desc("Disable protection against removing loop preheaders")); static cl::opt<bool> ProfileGuidedSectionPrefix( - "profile-guided-section-prefix", cl::Hidden, cl::init(true), + "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore, cl::desc("Use profile info to add section prefix for hot/cold functions")); static cl::opt<unsigned> FreqRatioToSkipMerge( @@ -135,15 +146,29 @@ static cl::opt<bool> ForceSplitStore( "force-split-store", cl::Hidden, cl::init(false), cl::desc("Force store splitting no matter what the target query says.")); +static cl::opt<bool> 
+EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), cl::init(true)); + +static cl::opt<unsigned> MemCmpNumLoadsPerBlock( + "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), + cl::desc("The number of loads per basic block for inline expansion of " + "memcmp that is only being compared against zero.")); + namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; typedef PointerIntPair<Type *, 1, bool> TypeIsSExt; typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; +typedef SmallVector<Instruction *, 16> SExts; +typedef DenseMap<Value *, SExts> ValueToSExts; class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { const TargetMachine *TM; + const TargetSubtargetInfo *SubtargetInfo; const TargetLowering *TLI; + const TargetRegisterInfo *TRI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; const LoopInfo *LI; @@ -165,6 +190,15 @@ class TypePromotionTransaction; /// promotion for the current function. InstrToOrigTy PromotedInsts; + /// Keep track of instructions removed during promotion. + SetOfInstrs RemovedInsts; + + /// Keep track of sext chains based on their initial value. + DenseMap<Value *, Instruction *> SeenChainsForSExt; + + /// Keep track of SExt promoted. + ValueToSExts ValToSExtendedUses; + /// True if CFG is modified in any way. bool ModifiedDT; @@ -176,10 +210,11 @@ class TypePromotionTransaction; public: static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) { - initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); - } + CodeGenPrepare() + : FunctionPass(ID), TM(nullptr), TLI(nullptr), TTI(nullptr), + DL(nullptr) { + initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "CodeGen Prepare"; } @@ -200,13 +235,13 @@ class TypePromotionTransaction; void eliminateMostlyEmptyBlock(BasicBlock *BB); bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, bool isPreheader); - bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); - bool optimizeInst(Instruction *I, bool& ModifiedDT); + bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT); + bool optimizeInst(Instruction *I, bool &ModifiedDT); bool optimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy, unsigned AS); bool optimizeInlineAsmInst(CallInst *CS); - bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); - bool moveExtToFormExtLoad(Instruction *&I); + bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); + bool optimizeExt(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *I); bool optimizeSelectInst(SelectInst *SI); @@ -215,26 +250,32 @@ class TypePromotionTransaction; bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB); bool placeDbgValues(Function &F); - bool sinkAndCmp(Function &F); - bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, - Instruction *&Inst, - const SmallVectorImpl<Instruction *> &Exts, - unsigned CreatedInstCost); + bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts, + LoadInst *&LI, Instruction *&Inst, bool HasPromoted); + bool tryToPromoteExts(TypePromotionTransaction &TPT, + const SmallVectorImpl<Instruction *> &Exts, + SmallVectorImpl<Instruction *> &ProfitablyMovedExts, 
+ unsigned CreatedInstsCost = 0); + bool mergeSExts(Function &F); + bool performAddressTypePromotion( + Instruction *&Inst, + bool AllowPromotionWithoutCommonHeader, + bool HasPromoted, TypePromotionTransaction &TPT, + SmallVectorImpl<Instruction *> &SpeculativelyMovedExts); bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(Instruction &I); + bool splitIndirectCriticalEdges(Function &F); }; } char CodeGenPrepare::ID = 0; -INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false) +INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, + "Optimize for code generation", false, false) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false) +INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, + "Optimize for code generation", false, false) -FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { - return new CodeGenPrepare(TM); -} +FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); } bool CodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) @@ -250,8 +291,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) { BPI.reset(); ModifiedDT = false; - if (TM) - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { + TM = &TPC->getTM<TargetMachine>(); + SubtargetInfo = TM->getSubtargetImpl(F); + TLI = SubtargetInfo->getTargetLowering(); + TRI = SubtargetInfo->getRegisterInfo(); + } TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -260,10 +305,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (ProfileGuidedSectionPrefix) { ProfileSummaryInfo *PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - if (PSI->isFunctionEntryHot(&F)) + if (PSI->isFunctionHotInCallGraph(&F)) F.setSectionPrefix(".hot"); - else if (PSI->isFunctionEntryCold(&F)) - F.setSectionPrefix(".cold"); + else if (PSI->isFunctionColdInCallGraph(&F)) + F.setSectionPrefix(".unlikely"); } /// This optimization identifies DIV instructions that can be @@ -290,18 +335,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // find a node corresponding to the value. EverMadeChange |= placeDbgValues(F); - // If there is a mask, compare against zero, and branch that can be combined - // into a single target instruction, push the mask and compare into branch - // users. Do this before OptimizeBlock -> OptimizeInst -> - // OptimizeCmpExpression, which perturbs the pattern being searched for. - if (!DisableBranchOpts) { - EverMadeChange |= sinkAndCmp(F); + if (!DisableBranchOpts) EverMadeChange |= splitBranchCondition(F); - } + + // Split some critical edges where one of the sources is an indirect branch, + // to help generate sane code for PHIs involving such edges. 
+ EverMadeChange |= splitIndirectCriticalEdges(F); bool MadeChange = true; while (MadeChange) { MadeChange = false; + SeenChainsForSExt.clear(); + ValToSExtendedUses.clear(); + RemovedInsts.clear(); for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = &*I++; bool ModifiedDTOnIteration = false; @@ -311,6 +357,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (ModifiedDTOnIteration) break; } + if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()) + MadeChange |= mergeSExts(F); + + // Really free removed instructions during promotion. + for (Instruction *I : RemovedInsts) + I->deleteValue(); + EverMadeChange |= MadeChange; } @@ -432,6 +485,160 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) { return DestBB; } +// Return the unique indirectbr predecessor of a block. This may return null +// even if such a predecessor exists, if it's not useful for splitting. +// If a predecessor is found, OtherPreds will contain all other (non-indirectbr) +// predecessors of BB. +static BasicBlock * +findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { + // If the block doesn't have any PHIs, we don't care about it, since there's + // no point in splitting it. + PHINode *PN = dyn_cast<PHINode>(BB->begin()); + if (!PN) + return nullptr; + + // Verify we have exactly one IBR predecessor. + // Conservatively bail out if one of the other predecessors is not a "regular" + // terminator (that is, not a switch or a br). + BasicBlock *IBB = nullptr; + for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { + BasicBlock *PredBB = PN->getIncomingBlock(Pred); + TerminatorInst *PredTerm = PredBB->getTerminator(); + switch (PredTerm->getOpcode()) { + case Instruction::IndirectBr: + if (IBB) + return nullptr; + IBB = PredBB; + break; + case Instruction::Br: + case Instruction::Switch: + OtherPreds.push_back(PredBB); + continue; + default: + return nullptr; + } + } + + return IBB; +} + +// Split critical edges where the source of the edge is an indirectbr +// instruction. This isn't always possible, but we can handle some easy cases. +// This is useful because MI is unable to split such critical edges, +// which means it will not be able to sink instructions along those edges. +// This is especially painful for indirect branches with many successors, where +// we end up having to prepare all outgoing values in the origin block. +// +// Our normal algorithm for splitting critical edges requires us to update +// the outgoing edges of the edge origin block, but for an indirectbr this +// is hard, since it would require finding and updating the block addresses +// the indirect branch uses. But if a block only has a single indirectbr +// predecessor, with the others being regular branches, we can do it in a +// different way. +// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr. +// We can split D into D0 and D1, where D0 contains only the PHIs from D, +// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and +// create the following structure: +// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1 +bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) { + // Check whether the function has any indirectbrs, and collect which blocks + // they may jump to. Since most functions don't have indirect branches, + // this lowers the common case's overhead to O(Blocks) instead of O(Edges). 
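As an illustrative aside (not part of the LLVM change itself): the predecessor screening that findIBRPredecessor does above can be modelled in a few lines of plain C++. The enum, the block indices and the sample predecessor list below are invented for the example; only the classification rule (exactly one indirectbr predecessor, all others plain br/switch, anything else aborts) is taken from the patch.

    #include <cstdio>
    #include <vector>

    // Invented stand-in for "what kind of terminator does this predecessor end in".
    enum class TermKind { Br, Switch, IndirectBr, Other };

    // Returns the index of the unique indirectbr predecessor, or -1 if the block
    // is not a candidate for splitting (no IBR pred, two IBR preds, or an
    // unusual terminator such as invoke).
    static int findIBRPred(const std::vector<TermKind> &PredTerms,
                           std::vector<int> &OtherPreds) {
      int IBB = -1;
      for (int i = 0, e = (int)PredTerms.size(); i != e; ++i) {
        switch (PredTerms[i]) {
        case TermKind::IndirectBr:
          if (IBB != -1)
            return -1;              // more than one indirectbr predecessor
          IBB = i;
          break;
        case TermKind::Br:
        case TermKind::Switch:
          OtherPreds.push_back(i);  // regular predecessor, can be retargeted
          break;
        default:
          return -1;                // anything else: be conservative
        }
      }
      return IBB;
    }

    int main() {
      std::vector<int> Others;
      std::vector<TermKind> Preds = {TermKind::Br, TermKind::IndirectBr,
                                     TermKind::Switch};
      printf("ibr pred index: %d, other preds: %zu\n",
             findIBRPred(Preds, Others), Others.size());
      return 0;
    }

Only when such a unique indirectbr predecessor exists alongside at least one regular predecessor does the pass go on to clone the PHI-only head block as described in the comment above.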
+ SmallSetVector<BasicBlock *, 16> Targets; + for (auto &BB : F) { + auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator()); + if (!IBI) + continue; + + for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ) + Targets.insert(IBI->getSuccessor(Succ)); + } + + if (Targets.empty()) + return false; + + bool Changed = false; + for (BasicBlock *Target : Targets) { + SmallVector<BasicBlock *, 16> OtherPreds; + BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds); + // If we did not found an indirectbr, or the indirectbr is the only + // incoming edge, this isn't the kind of edge we're looking for. + if (!IBRPred || OtherPreds.empty()) + continue; + + // Don't even think about ehpads/landingpads. + Instruction *FirstNonPHI = Target->getFirstNonPHI(); + if (FirstNonPHI->isEHPad() || Target->isLandingPad()) + continue; + + BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); + // It's possible Target was its own successor through an indirectbr. + // In this case, the indirectbr now comes from BodyBlock. + if (IBRPred == Target) + IBRPred = BodyBlock; + + // At this point Target only has PHIs, and BodyBlock has the rest of the + // block's body. Create a copy of Target that will be used by the "direct" + // preds. + ValueToValueMapTy VMap; + BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F); + + for (BasicBlock *Pred : OtherPreds) { + // If the target is a loop to itself, then the terminator of the split + // block needs to be updated. + if (Pred == Target) + BodyBlock->getTerminator()->replaceUsesOfWith(Target, DirectSucc); + else + Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc); + } + + // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that + // they are clones, so the number of PHIs are the same. + // (a) Remove the edge coming from IBRPred from the "Direct" PHI + // (b) Leave that as the only edge in the "Indirect" PHI. + // (c) Merge the two in the body block. + BasicBlock::iterator Indirect = Target->begin(), + End = Target->getFirstNonPHI()->getIterator(); + BasicBlock::iterator Direct = DirectSucc->begin(); + BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt(); + + assert(&*End == Target->getTerminator() && + "Block was expected to only contain PHIs"); + + while (Indirect != End) { + PHINode *DirPHI = cast<PHINode>(Direct); + PHINode *IndPHI = cast<PHINode>(Indirect); + + // Now, clean up - the direct block shouldn't get the indirect value, + // and vice versa. + DirPHI->removeIncomingValue(IBRPred); + Direct++; + + // Advance the pointer here, to avoid invalidation issues when the old + // PHI is erased. + Indirect++; + + PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI); + NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred), + IBRPred); + + // Create a PHI in the body block, to merge the direct and indirect + // predecessors. + PHINode *MergePHI = + PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert); + MergePHI->addIncoming(NewIndPHI, Target); + MergePHI->addIncoming(DirPHI, DirectSucc); + + IndPHI->replaceAllUsesWith(MergePHI); + IndPHI->eraseFromParent(); + } + + Changed = true; + } + + return Changed; +} + /// Eliminate blocks that contain only PHI nodes, debug info directives, and an /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. 
Start by eliminating these @@ -1090,6 +1297,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) { return false; } +/// Duplicate and sink the given 'and' instruction into user blocks where it is +/// used in a compare to allow isel to generate better code for targets where +/// this operation can be combined. +/// +/// Return true if any changes are made. +static bool sinkAndCmp0Expression(Instruction *AndI, + const TargetLowering &TLI, + SetOfInstrs &InsertedInsts) { + // Double-check that we're not trying to optimize an instruction that was + // already optimized by some other part of this pass. + assert(!InsertedInsts.count(AndI) && + "Attempting to optimize already optimized and instruction"); + (void) InsertedInsts; + + // Nothing to do for single use in same basic block. + if (AndI->hasOneUse() && + AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()) + return false; + + // Try to avoid cases where sinking/duplicating is likely to increase register + // pressure. + if (!isa<ConstantInt>(AndI->getOperand(0)) && + !isa<ConstantInt>(AndI->getOperand(1)) && + AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse()) + return false; + + for (auto *U : AndI->users()) { + Instruction *User = cast<Instruction>(U); + + // Only sink for and mask feeding icmp with 0. + if (!isa<ICmpInst>(User)) + return false; + + auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); + if (!CmpC || !CmpC->isZero()) + return false; + } + + if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI)) + return false; + + DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); + DEBUG(AndI->getParent()->dump()); + + // Push the 'and' into the same block as the icmp 0. There should only be + // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any + // others, so we don't need to keep track of which BBs we insert into. + for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); + UI != E; ) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI); + + // Preincrement use iterator so we don't invalidate it. + ++UI; + + DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); + + // Keep the 'and' in the same place if the use is already in the same block. + Instruction *InsertPt = + User->getParent() == AndI->getParent() ? AndI : User; + Instruction *InsertedAnd = + BinaryOperator::Create(Instruction::And, AndI->getOperand(0), + AndI->getOperand(1), "", InsertPt); + // Propagate the debug info. + InsertedAnd->setDebugLoc(AndI->getDebugLoc()); + + // Replace a use of the 'and' with a use of the new 'and'. + TheUse = InsertedAnd; + ++NumAndUses; + DEBUG(User->getParent()->dump()); + } + + // We removed all uses, nuke the and. + AndI->eraseFromParent(); + return true; +} + /// Check if the candidates could be combined with a shift instruction, which /// includes: /// 1. 
Truncate instruction @@ -1278,519 +1562,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -// Translate a masked load intrinsic like -// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, -// <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.load, label %else -// -//cond.load: ; preds = %0 -// %4 = getelementptr i32* %1, i32 0 -// %5 = load i32* %4 -// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 -// br label %else -// -//else: ; preds = %0, %cond.load -// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] -// %7 = extractelement <16 x i1> %mask, i32 1 -// %8 = icmp eq i1 %7, true -// br i1 %8, label %cond.load1, label %else2 -// -//cond.load1: ; preds = %else -// %9 = getelementptr i32* %1, i32 1 -// %10 = load i32* %9 -// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 -// br label %else2 -// -//else2: ; preds = %else, %cond.load1 -// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] -// %12 = extractelement <16 x i1> %mask, i32 2 -// %13 = icmp eq i1 %12, true -// br i1 %13, label %cond.load4, label %else5 -// -static void scalarizeMaskedLoad(CallInst *CI) { - Value *Ptr = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - assert(VecType && "Unexpected return type of masked load intrinsic"); - - Type *EltTy = CI->getType()->getVectorElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa<Constant>(Mask) && - cast<Constant>(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. 
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - - if (isa<ConstantVector>(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_load = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked store intrinsic, like -// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, -// <16 x i1> %mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.store, label %else -// -// cond.store: ; preds = %0 -// %4 = extractelement <16 x i32> %val, i32 0 -// %5 = getelementptr i32* %1, i32 0 -// store i32 %4, i32* %5 -// br label %else -// -// else: ; preds = %0, %cond.store -// %6 = extractelement <16 x i1> %mask, i32 
1 -// %7 = icmp eq i1 %6, true -// br i1 %7, label %cond.store1, label %else2 -// -// cond.store1: ; preds = %else -// %8 = extractelement <16 x i32> %val, i32 1 -// %9 = getelementptr i32* %1, i32 1 -// store i32 %8, i32* %9 -// br label %else2 -// . . . -static void scalarizeMaskedStore(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast<VectorType>(Src->getType()); - assert(VecType && "Unexpected data type in masked store intrinsic"); - - Type *EltTy = VecType->getElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa<Constant>(Mask) && - cast<Constant>(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Builder.CreateAlignedStore(Src, Ptr, AlignVal); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. - AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - if (isa<ConstantVector>(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - } - CI->eraseFromParent(); - return; - } - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_store, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %OneElt = extractelement <16 x i32> %Src, i32 Idx - // %EltAddr = getelementptr i32* %1, i32 0 - // %store i32 %OneElt, i32* %EltAddr - // - BasicBlock *CondBlock = - IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - -// Translate a masked gather intrinsic like -// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, -// <16 x i1> %Mask, <16 x i32> %Src) -// to a chain of basic blocks, with loading element one-by-one if -// the 
appropriate mask bit is set -// -// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> %Mask, i32 0 -// % ToLoad0 = icmp eq i1 % Mask0, true -// br i1 % ToLoad0, label %cond.load, label %else -// -// cond.load: -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// % Load0 = load i32, i32* % Ptr0, align 4 -// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 -// br label %else -// -// else: -// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] -// % Mask1 = extractelement <16 x i1> %Mask, i32 1 -// % ToLoad1 = icmp eq i1 % Mask1, true -// br i1 % ToLoad1, label %cond.load1, label %else2 -// -// cond.load1: -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// % Load1 = load i32, i32* % Ptr1, align 4 -// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 -// br label %else2 -// . . . -// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src -// ret <16 x i32> %Result -static void scalarizeMaskedGather(CallInst *CI) { - Value *Ptrs = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - - assert(VecType && "Unexpected return type of masked load intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); - - // Shorten the way if the mask is a vector of constants. 
- bool IsConstMask = isa<ConstantVector>(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx), - "Res" + Twine(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %Mask1 = extractelement <16 x i1> %Mask, i32 1 - // %ToLoad1 = icmp eq i1 %Mask1, true - // br i1 %ToLoad1, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToLoad" + Twine(Idx)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), - "Res" + Twine(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked scatter intrinsic, like -// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, -// <16 x i1> %Mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set. 
-// -// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> % Mask, i32 0 -// % ToStore0 = icmp eq i1 % Mask0, true -// br i1 %ToStore0, label %cond.store, label %else -// -// cond.store: -// % Elt0 = extractelement <16 x i32> %Src, i32 0 -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// store i32 %Elt0, i32* % Ptr0, align 4 -// br label %else -// -// else: -// % Mask1 = extractelement <16 x i1> % Mask, i32 1 -// % ToStore1 = icmp eq i1 % Mask1, true -// br i1 % ToStore1, label %cond.store1, label %else2 -// -// cond.store1: -// % Elt1 = extractelement <16 x i32> %Src, i32 1 -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// store i32 % Elt1, i32* % Ptr1, align 4 -// br label %else2 -// . . . -static void scalarizeMaskedScatter(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptrs = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - assert(isa<VectorType>(Src->getType()) && - "Unexpected data type in masked scatter intrinsic"); - assert(isa<VectorType>(Ptrs->getType()) && - isa<PointerType>(Ptrs->getType()->getVectorElementType()) && - "Vector of pointers is expected in masked scatter intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - unsigned VectorWidth = Src->getType()->getVectorNumElements(); - - // Shorten the way if the mask is a vector of constants. - bool IsConstMask = isa<ConstantVector>(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - } - CI->eraseFromParent(); - return; - } - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - // Fill the "else" block, created in the previous iteration - // - // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx - // % ToStore = icmp eq i1 % Mask1, true - // br i1 % ToStore, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = - Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToStore" + Twine(Idx)); - - // Create "cond" block - // - // % Elt1 = extractelement <16 x i32> %Src, i32 1 - // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 - // %store i32 % Elt1, i32* % Ptr1 - // - BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - /// 
If counting leading or trailing zeros is an expensive operation and a zero /// input is defined, add a check for zero to avoid calling the intrinsic. /// @@ -1870,7 +1641,657 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return true; } -bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. +class MemCmpExpansion { + struct ResultBlock { + BasicBlock *BB; + PHINode *PhiSrc1; + PHINode *PhiSrc2; + ResultBlock(); + }; + + CallInst *CI; + ResultBlock ResBlock; + unsigned MaxLoadSize; + unsigned NumBlocks; + unsigned NumBlocksNonOneByte; + unsigned NumLoadsPerBlock; + std::vector<BasicBlock *> LoadCmpBlocks; + BasicBlock *EndBlock; + PHINode *PhiRes; + bool IsUsedForZeroCmp; + const DataLayout &DL; + IRBuilder<> Builder; + + unsigned calculateNumBlocks(unsigned Size); + void createLoadCmpBlocks(); + void createResultBlock(); + void setupResultBlockPHINodes(); + void setupEndBlockPHINodes(); + void emitLoadCompareBlock(unsigned Index, unsigned LoadSize, + unsigned GEPIndex); + Value *getCompareLoadPairs(unsigned Index, unsigned Size, + unsigned &NumBytesProcessed); + void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, + unsigned &NumBytesProcessed); + void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex); + void emitMemCmpResultBlock(); + Value *getMemCmpExpansionZeroCase(unsigned Size); + Value *getMemCmpEqZeroOneBlock(unsigned Size); + Value *getMemCmpOneBlock(unsigned Size); + unsigned getLoadSize(unsigned Size); + unsigned getNumLoads(unsigned Size); + +public: + MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, + unsigned NumLoadsPerBlock, const DataLayout &DL); + Value *getMemCmpExpansion(uint64_t Size); +}; + +MemCmpExpansion::ResultBlock::ResultBlock() + : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {} + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, + unsigned MaxLoadSize, unsigned LoadsPerBlock, + const DataLayout &TheDataLayout) + : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock), + DL(TheDataLayout), Builder(CI) { + + // A memcmp with zero-comparison with only one block of load and compare does + // not need to set up any extra blocks. This case could be handled in the DAG, + // but since we have all of the machinery to flexibly expand any memcpy here, + // we choose to handle this case too to avoid fragmented lowering. + IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + NumBlocks = calculateNumBlocks(Size); + if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. 
+ if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); +} + +void MemCmpExpansion::createLoadCmpBlocks() { + for (unsigned i = 0; i < NumBlocks; i++) { + BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", + EndBlock->getParent(), EndBlock); + LoadCmpBlocks.push_back(BB); + } +} + +void MemCmpExpansion::createResultBlock() { + ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", + EndBlock->getParent(), EndBlock); +} + +// This function creates the IR instructions for loading and comparing 1 byte. +// It loads 1 byte from each source of the memcmp parameters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result. +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, + unsigned GEPIndex) { + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex. + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + + if (Index < (LoadCmpBlocks.size() - 1)) { + // Early exit branch if difference found to EndBlock. Otherwise, continue to + // next LoadCmpBlock, + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BranchInst *CmpBr = + BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp); + Builder.Insert(CmpBr); + } else { + // The last block has an unconditional branch to EndBlock. + BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); + } +} + +unsigned MemCmpExpansion::getNumLoads(unsigned Size) { + return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize); +} + +unsigned MemCmpExpansion::getLoadSize(unsigned Size) { + return MinAlign(PowerOf2Floor(Size), MaxLoadSize); +} + +/// Generate an equality comparison for one or more pairs of loaded values. +/// This is used in the case where the memcmp() call is compared equal or not +/// equal to zero. 
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, + unsigned &NumBytesProcessed) { + std::vector<Value *> XorList, OrList; + Value *Diff; + + unsigned RemainingBytes = Size - NumBytesProcessed; + unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); + unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); + + // For a single-block expansion, start inserting before the memcmp call. + if (LoadCmpBlocks.empty()) + Builder.SetInsertPoint(CI); + else + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + + Value *Cmp = nullptr; + for (unsigned i = 0; i < NumLoads; ++i) { + unsigned LoadSize = getLoadSize(RemainingBytes); + unsigned GEPIndex = NumBytesProcessed / LoadSize; + NumBytesProcessed += LoadSize; + RemainingBytes -= LoadSize; + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + assert(LoadSize <= MaxLoadSize && "Unexpected load type"); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex. + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Get a constant or load a value for each source address. + Value *LoadSrc1 = nullptr; + if (auto *Source1C = dyn_cast<Constant>(Source1)) + LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); + if (!LoadSrc1) + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + + Value *LoadSrc2 = nullptr; + if (auto *Source2C = dyn_cast<Constant>(Source2)) + LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); + if (!LoadSrc2) + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (NumLoads != 1) { + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); + } + // If we have multiple loads per block, we need to generate a composite + // comparison using xor+or. + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateZExt(Diff, MaxLoadType); + XorList.push_back(Diff); + } else { + // If there's only one load per block, we just compare the loaded values. + Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); + } + } + + auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { + std::vector<Value *> OutList; + for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { + Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); + OutList.push_back(Or); + } + if (InList.size() % 2 != 0) + OutList.push_back(InList.back()); + return OutList; + }; + + if (!Cmp) { + // Pairwise OR the XOR results. + OrList = pairWiseOr(XorList); + + // Pairwise OR the OR results until one result left. 
+ while (OrList.size() != 1) { + OrList = pairWiseOr(OrList); + } + Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); + } + + return Cmp; +} + +void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( + unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { + Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed); + + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock. Otherwise, + // continue to next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function creates the IR intructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does a subtract to see if there was +// a difference in the loaded values. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was +// larger. Otherwise, it falls through to the either the next LoadCmpBlock or +// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with +// a special case through emitLoadCompareByteBlock. The special handling can +// simply subtract the loaded values and add it to the result phi node. +void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, + unsigned GEPIndex) { + if (LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); + return; + } + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + assert(LoadSize <= MaxLoadSize && "Unexpected load type"); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex. + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address. + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian()) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality. 
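As an illustrative aside (not part of the LLVM change itself): the zero-equality path built by getCompareLoadPairs above reduces the per-load differences with XOR and a pairwise OR tree instead of branching between loads. A minimal standalone sketch of that reduction, assuming four 8-byte load pairs with invented values:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Model of the xor + pairwise-or reduction used when the memcmp result is
    // only compared against zero. Buffer contents are made up for the example.
    int main() {
      uint64_t Src1[4] = {1, 2, 3, 4};
      uint64_t Src2[4] = {1, 2, 3, 5};          // differs in the last load pair

      std::vector<uint64_t> XorList;
      for (int i = 0; i < 4; ++i)
        XorList.push_back(Src1[i] ^ Src2[i]);   // nonzero iff this pair differs

      // Pairwise OR until a single value remains (the OrList loop above).
      while (XorList.size() > 1) {
        std::vector<uint64_t> OrList;
        for (size_t i = 0; i + 1 < XorList.size(); i += 2)
          OrList.push_back(XorList[i] | XorList[i + 1]);
        if (XorList.size() % 2 != 0)
          OrList.push_back(XorList.back());     // odd element carried over
        XorList = OrList;
      }

      // memcmp(...) != 0 exactly when the or-reduced value is nonzero.
      printf("buffers differ: %s\n", XorList[0] != 0 ? "yes" : "no");
      return 0;
    }

A single set bit anywhere in the or-reduced value is enough to make the final icmp ne fire, which is all a zero-equality memcmp needs to know.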
+ if (!IsUsedForZeroCmp) { + ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); + ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock. Otherwise, continue + // to next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock() { + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. + if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) { + unsigned NumBlocks = 0; + bool HaveOneByteLoad = false; + unsigned RemainingSize = Size; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + if (LoadSize == 1) + HaveOneByteLoad = true; + NumBlocks += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + NumBlocksNonOneByte = HaveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + + if (IsUsedForZeroCmp) + NumBlocks = NumBlocks / NumLoadsPerBlock + + (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0); + + return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) { + unsigned NumBytesProcessed = 0; + // This loop populates each of the LoadCmpBlocks with the IR sequence to + // handle multiple loads per block. 
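As an illustrative aside (not part of the LLVM change itself): the sizing arithmetic in getNumLoads, getLoadSize and calculateNumBlocks above amounts to greedily covering the compare size with power-of-two loads. A standalone sketch, assuming a 15-byte compare, an 8-byte maximum load size and one load per block (all assumed values, not taken from the patch):

    #include <cstdio>

    int main() {
      unsigned Size = 15, MaxLoadSize = 8, NumLoadsPerBlock = 1;

      // Cover the size with the largest load that still fits, halving the load
      // size each round, as calculateNumBlocks() does: 15 = 8 + 4 + 2 + 1.
      unsigned Remaining = Size, LoadSize = MaxLoadSize;
      unsigned NumLoads = 0, NumBlocks = 0;
      bool HaveOneByteLoad = false;
      while (Remaining) {
        if (LoadSize == 1)
          HaveOneByteLoad = true;
        unsigned N = Remaining / LoadSize;
        if (N)
          printf("%u load(s) of %u byte(s)\n", N, LoadSize);
        NumLoads += N;
        NumBlocks += N;
        Remaining %= LoadSize;
        LoadSize /= 2;
      }

      // Blocks other than the 1-byte block feed the result block's PHIs; the
      // 1-byte block computes its byte difference directly.
      unsigned NumBlocksNonOneByte = HaveOneByteLoad ? NumBlocks - 1 : NumBlocks;
      printf("loads: %u, blocks: %u (non-1-byte: %u)\n", NumLoads, NumBlocks,
             NumBlocksNonOneByte);

      // When the result is only compared against zero, several load pairs can
      // share one block (rounded-up division by NumLoadsPerBlock).
      unsigned ZeroCmpBlocks =
          NumBlocks / NumLoadsPerBlock + (NumBlocks % NumLoadsPerBlock ? 1 : 0);
      printf("blocks if only compared to zero: %u\n", ZeroCmpBlocks);
      return 0;
    }

With those assumed numbers the expansion issues loads of 8, 4, 2 and 1 bytes, i.e. four load/compare blocks, three of which feed the result block's PHI nodes.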
+ for (unsigned i = 0; i < NumBlocks; ++i) + emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed); + + emitMemCmpResultBlock(); + return PhiRes; +} + +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { + unsigned NumBytesProcessed = 0; + Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. +Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) { + assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Load LoadSizeType from the base address. + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian() && Size != 1) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + // TODO: Instead of comparing ULT, just subtract and return the difference? + Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); + Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); + Type *I32 = Builder.getInt32Ty(); + Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1), + ConstantInt::get(I32, 1)); + return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0)); +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { + if (IsUsedForZeroCmp) + return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : + getMemCmpExpansionZeroCase(Size); + + // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). + if (NumBlocks == 1 && NumLoadsPerBlock == 1) + return getMemCmpOneBlock(Size); + + // This loop calls emitLoadCompareBlock for comparing Size bytes of the two + // memcmp sources. It starts with loading using the maximum load size set by + // the target. It processes any remaining bytes using a load size which is the + // next smallest power of 2. + unsigned LoadSize = MaxLoadSize; + unsigned NumBytesToBeProcessed = Size; + unsigned Index = 0; + while (NumBytesToBeProcessed) { + // Calculate how many blocks we can create with the current load size. + unsigned NumBlocks = NumBytesToBeProcessed / LoadSize; + unsigned GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; + NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; + + // For each NumBlocks, populate the instruction sequence for loading and + // comparing LoadSize bytes. + while (NumBlocks--) { + emitLoadCompareBlock(Index, LoadSize, GEPIndex); + Index++; + GEPIndex++; + } + // Get the next LoadSize to use. 
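As an illustrative aside (not part of the LLVM change itself): getMemCmpOneBlock above folds the three-way memcmp result into two compares and two selects. A tiny standalone model of that select chain follows; the fixed 8-byte width and the test values are assumed, and the byte swapping the real code applies on little-endian targets (so that an unsigned integer compare matches memcmp's byte order) is left out.

    #include <cstdint>
    #include <cstdio>

    // Mirrors select(CmpNE, select(CmpULT, -1, 1), 0) from the one-block case.
    static int oneBlockResult(uint64_t LoadSrc1, uint64_t LoadSrc2) {
      int Sel1 = (LoadSrc1 < LoadSrc2) ? -1 : 1;   // CmpULT picks -1 or 1
      return (LoadSrc1 != LoadSrc2) ? Sel1 : 0;    // CmpNE picks Sel1 or 0
    }

    int main() {
      printf("%d %d %d\n", oneBlockResult(7, 7), oneBlockResult(3, 9),
             oneBlockResult(9, 3));                // prints: 0 -1 1
      return 0;
    }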
+ LoadSize = LoadSize / 2; + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with a new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = call i64 @llvm.bswap.i64(i64 %4) +/// %7 = call i64 @llvm.bswap.i64(i64 %5) +/// %8 = sub i64 %6, %7 +/// %9 = icmp ne i64 %8, 0 +/// br i1 %9, label %res_block, label %loadbb1 +/// res_block: ; preds = %loadbb2, +/// %loadbb1, %loadbb +/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +/// %10 = icmp ult i64 %phi.src1, %phi.src2 +/// %11 = select i1 %10, i32 -1, i32 1 +/// br label %endblock +/// loadbb1: ; preds = %loadbb +/// %12 = bitcast i32* %buffer2 to i8* +/// %13 = bitcast i32* %buffer1 to i8* +/// %14 = bitcast i8* %13 to i32* +/// %15 = bitcast i8* %12 to i32* +/// %16 = getelementptr i32, i32* %14, i32 2 +/// %17 = getelementptr i32, i32* %15, i32 2 +/// %18 = load i32, i32* %16 +/// %19 = load i32, i32* %17 +/// %20 = call i32 @llvm.bswap.i32(i32 %18) +/// %21 = call i32 @llvm.bswap.i32(i32 %19) +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = sub i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb2 +/// loadbb2: ; preds = %loadbb1 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = bitcast i8* %27 to i16* +/// %29 = bitcast i8* %26 to i16* +/// %30 = getelementptr i16, i16* %28, i16 6 +/// %31 = getelementptr i16, i16* %29, i16 6 +/// %32 = load i16, i16* %30 +/// %33 = load i16, i16* %31 +/// %34 = call i16 @llvm.bswap.i16(i16 %32) +/// %35 = call i16 @llvm.bswap.i16(i16 %33) +/// %36 = zext i16 %34 to i64 +/// %37 = zext i16 %35 to i64 +/// %38 = sub i64 %36, %37 +/// %39 = icmp ne i64 %38, 0 +/// br i1 %39, label %res_block, label %loadbb3 +/// loadbb3: ; preds = %loadbb2 +/// %40 = bitcast i32* %buffer2 to i8* +/// %41 = bitcast i32* %buffer1 to i8* +/// %42 = getelementptr i8, i8* %41, i8 14 +/// %43 = getelementptr i8, i8* %40, i8 14 +/// %44 = load i8, i8* %42 +/// %45 = load i8, i8* %43 +/// %46 = zext i8 %44 to i32 +/// %47 = zext i8 %45 to i32 +/// %48 = sub i32 %46, %47 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb3 +/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] +/// ret i32 %phi.res +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering *TLI, const DataLayout *DL) { + NumMemCmpCalls++; + + // TTI call to check if target would like to expand memcmp. Also, get the + // MaxLoadSize. + unsigned MaxLoadSize; + if (!TTI->expandMemCmp(CI, MaxLoadSize)) + return false; + + // Early exit from expansion if -Oz. + if (CI->getFunction()->optForMinSize()) + return false; + + // Early exit from expansion if size is not a constant. 
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + + // Early exit from expansion if size greater than max bytes to load. + uint64_t SizeVal = SizeCast->getZExtValue(); + unsigned NumLoads = 0; + unsigned RemainingSize = SizeVal; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + NumLoads += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + + if (NumLoads > TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize())) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + // MemCmpHelper object creates and sets up basic blocks required for + // expanding memcmp with size SizeVal. + unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock; + MemCmpExpansion MemCmpHelper(CI, SizeVal, MaxLoadSize, NumLoadsPerBlock, *DL); + + Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal); + + // Replace call with result of expansion and erase call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + +bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. @@ -1955,10 +2376,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { ConstantInt *RetVal = lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true); // Substituting this can cause recursive simplifications, which can - // invalidate our iterator. Use a WeakVH to hold onto it in case this + // invalidate our iterator. Use a WeakTrackingVH to hold onto it in case + // this // happens. Value *CurValue = &*CurInstIterator; - WeakVH IterHandle(CurValue); + WeakTrackingVH IterHandle(CurValue); replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); @@ -1970,39 +2392,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } return true; } - case Intrinsic::masked_load: { - // Scalarize unsupported vector masked load - if (!TTI->isLegalMaskedLoad(CI->getType())) { - scalarizeMaskedLoad(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_store: { - if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { - scalarizeMaskedStore(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_gather: { - if (!TTI->isLegalMaskedGather(CI->getType())) { - scalarizeMaskedGather(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_scatter: { - if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { - scalarizeMaskedScatter(CI); - ModifiedDT = true; - return true; - } - return false; - } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); @@ -2028,16 +2417,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } if (TLI) { - // Unknown address space. - // TODO: Target hook to pick which address space the intrinsic cares - // about? 
- unsigned AddrSpace = ~0u; SmallVector<Value*, 2> PtrOps; Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) - while (!PtrOps.empty()) - if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) + if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) { + Value *PtrVal = PtrOps.pop_back_val(); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) return true; + } } } @@ -2054,6 +2442,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { CI->eraseFromParent(); return true; } + + LibFunc Func; + if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) && + Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } return false; } @@ -2168,11 +2563,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CalleeAttrs = CS.getAttributes(); - if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias) != - AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias)) + AttributeList CalleeAttrs = CS.getAttributes(); + if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) != + AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to @@ -2561,25 +2956,30 @@ class TypePromotionTransaction { OperandsHider Hider; /// Keep track of the uses replaced, if any. UsesReplacer *Replacer; + /// Keep track of instructions removed. + SetOfInstrs &RemovedInsts; public: /// \brief Remove all reference of \p Inst and optinally replace all its /// uses with New. + /// \p RemovedInsts Keep track of the instructions removed by this Action. /// \pre If !Inst->use_empty(), then New != nullptr - InstructionRemover(Instruction *Inst, Value *New = nullptr) + InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, + Value *New = nullptr) : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), - Replacer(nullptr) { + Replacer(nullptr), RemovedInsts(RemovedInsts) { if (New) Replacer = new UsesReplacer(Inst, New); DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); + RemovedInsts.insert(Inst); + /// The instructions removed here will be freed after completing + /// optimizeBlock() for all blocks as we need to keep track of the + /// removed instructions during promotion. Inst->removeFromParent(); } ~InstructionRemover() override { delete Replacer; } - /// \brief Really remove the instruction. - void commit() override { delete Inst; } - /// \brief Resurrect the instruction and reassign it to the proper uses if /// new value was provided when build this action. void undo() override { @@ -2588,6 +2988,7 @@ class TypePromotionTransaction { if (Replacer) Replacer->undo(); Hider.undo(); + RemovedInsts.erase(Inst); } }; @@ -2596,6 +2997,10 @@ public: /// The restoration point is a pointer to an action instead of an iterator /// because the iterator may be invalidated but not the pointer. typedef const TypePromotionAction *ConstRestorationPt; + + TypePromotionTransaction(SetOfInstrs &RemovedInsts) + : RemovedInsts(RemovedInsts) {} + /// Advocate every changes made in that transaction. 
void commit(); /// Undo all the changes made after the given point. @@ -2627,6 +3032,7 @@ private: /// The ordered list of actions made so far. SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions; typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt; + SetOfInstrs &RemovedInsts; }; void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, @@ -2638,7 +3044,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, void TypePromotionTransaction::eraseInstruction(Instruction *Inst, Value *NewVal) { Actions.push_back( - make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal)); + make_unique<TypePromotionTransaction::InstructionRemover>(Inst, + RemovedInsts, NewVal)); } void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, @@ -2705,8 +3112,8 @@ void TypePromotionTransaction::rollback( /// This encapsulates the logic for matching the target-legal addressing modes. class AddressingModeMatcher { SmallVectorImpl<Instruction*> &AddrModeInsts; - const TargetMachine &TM; const TargetLowering &TLI; + const TargetRegisterInfo &TRI; const DataLayout &DL; /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and @@ -2731,14 +3138,14 @@ class AddressingModeMatcher { bool IgnoreProfitability; AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, - const TargetMachine &TM, Type *AT, unsigned AS, + const TargetLowering &TLI, + const TargetRegisterInfo &TRI, + Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) - : AddrModeInsts(AMI), TM(TM), - TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) - ->getTargetLowering()), + : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT) { @@ -2756,13 +3163,15 @@ public: static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, SmallVectorImpl<Instruction*> &AddrModeInsts, - const TargetMachine &TM, + const TargetLowering &TLI, + const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) { ExtAddrMode Result; - bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, + AccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -3583,18 +3992,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { /// Check to see if all uses of OpVal by the specified inline asm call are due /// to memory operands. If so, return true, otherwise return false. 
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetMachine &TM) { - const Function *F = CI->getParent()->getParent(); - const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); - const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); + const TargetLowering &TLI, + const TargetRegisterInfo &TRI) { + const Function *F = CI->getFunction(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, + TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, ImmutableCallSite(CI)); + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. - TLI->ComputeConstraintToUse(OpInfo, SDValue()); + TLI.ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! @@ -3607,13 +4016,18 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, return true; } +// Max number of memory uses to look at before aborting the search to conserve +// compile time. +static constexpr int MaxMemoryUsesToScan = 20; + /// Recursively walk all the uses of I until we find a memory use. /// If we find an obviously non-foldable instruction, return true. /// Add the ultimately found memory instructions to MemoryUses. static bool FindAllMemoryUses( Instruction *I, SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, - SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) { + SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI, + const TargetRegisterInfo &TRI, int SeenInsts = 0) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -3626,8 +4040,12 @@ static bool FindAllMemoryUses( // Loop over all the uses, recursively processing them. for (Use &U : I->uses()) { - Instruction *UserI = cast<Instruction>(U.getUser()); + // Conservatively return true if we're seeing a large number or a deep chain + // of users. This avoids excessive compilation times in pathological cases. + if (SeenInsts++ >= MaxMemoryUsesToScan) + return true; + Instruction *UserI = cast<Instruction>(U.getUser()); if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) { MemoryUses.push_back(std::make_pair(LI, U.getOperandNo())); continue; @@ -3635,11 +4053,28 @@ static bool FindAllMemoryUses( if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { unsigned opNo = U.getOperandNo(); - if (opNo == 0) return true; // Storing addr, not into addr. + if (opNo != StoreInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { + unsigned opNo = U.getOperandNo(); + if (opNo != AtomicRMWInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(RMW, opNo)); + continue; + } + + if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { + unsigned opNo = U.getOperandNo(); + if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(CmpX, opNo)); + continue; + } + if (CallInst *CI = dyn_cast<CallInst>(UserI)) { // If this is a cold call, we can sink the addressing calculation into // the cold path. 
See optimizeCallInst @@ -3650,12 +4085,13 @@ static bool FindAllMemoryUses( if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. - if (!IsOperandAMemoryOperand(CI, IA, I, TM)) + if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI)) return true; continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, + SeenInsts)) return true; } @@ -3743,7 +4179,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // the use is just a particularly nice way of sinking it. SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; SmallPtrSet<Instruction*, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) return false; // Has a non-memory, non-foldable use! // Now that we know that all uses of this instruction are part of a chain of @@ -3775,7 +4211,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode Result; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, + AddressAccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT); Matcher.IgnoreProfitability = true; @@ -3839,84 +4276,70 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. - Value *Consensus = nullptr; - unsigned NumUsesConsensus = 0; - bool IsNumUsesConsensusValid = false; + bool AddrModeFound = false; + bool PhiSeen = false; SmallVector<Instruction*, 16> AddrModeInsts; ExtAddrMode AddrMode; - TypePromotionTransaction TPT; + TypePromotionTransaction TPT(RemovedInsts); TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); while (!worklist.empty()) { Value *V = worklist.back(); worklist.pop_back(); - // Break use-def graph loops. - if (!Visited.insert(V).second) { - Consensus = nullptr; - break; - } + // We allow traversing cyclic Phi nodes. + // In case of success after this loop we ensure that traversing through + // Phi nodes ends up with all cases to compute address of the form + // BaseGV + Base + Scale * Index + Offset + // where Scale and Offset are constants and BaseGV, Base and Index + // are exactly the same Values in all cases. + // It means that BaseGV, Scale and Offset dominate our memory instruction + // and have the same value as they had in address computation represented + // as Phi. So we can safely sink address computation to memory instruction. + if (!Visited.insert(V).second) + continue; // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (Value *IncValue : P->incoming_values()) worklist.push_back(IncValue); + PhiSeen = true; continue; } // For non-PHIs, determine the addressing mode being computed. Note that // the result may differ depending on what other uses our candidate // addressing instructions might have.
- SmallVector<Instruction*, 16> NewAddrModeInsts; + AddrModeInsts.clear(); ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, - InsertedInsts, PromotedInsts, TPT); - - // This check is broken into two cases with very similar code to avoid using - // getNumUses() as much as possible. Some values have a lot of uses, so - // calling getNumUses() unconditionally caused a significant compile-time - // regression. - if (!Consensus) { - Consensus = V; - AddrMode = NewAddrMode; - AddrModeInsts = NewAddrModeInsts; - continue; - } else if (NewAddrMode == AddrMode) { - if (!IsNumUsesConsensusValid) { - NumUsesConsensus = Consensus->getNumUses(); - IsNumUsesConsensusValid = true; - } + V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, + InsertedInsts, PromotedInsts, TPT); - // Ensure that the obtained addressing mode is equivalent to that obtained - // for all other roots of the PHI traversal. Also, when choosing one - // such root as representative, select the one with the most uses in order - // to keep the cost modeling heuristics in AddressingModeMatcher - // applicable. - unsigned NumUses = V->getNumUses(); - if (NumUses > NumUsesConsensus) { - Consensus = V; - NumUsesConsensus = NumUses; - AddrModeInsts = NewAddrModeInsts; - } + if (!AddrModeFound) { + AddrModeFound = true; + AddrMode = NewAddrMode; continue; } + if (NewAddrMode == AddrMode) + continue; - Consensus = nullptr; + AddrModeFound = false; break; } // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. - if (!Consensus) { + if (!AddrModeFound) { TPT.rollback(LastKnownGood); return false; } TPT.commit(); // If all the instructions matched are already in this BB, don't do anything. - if (none_of(AddrModeInsts, [&](Value *V) { + // If we saw Phi node then it is not local definitely. + if (!PhiSeen && none_of(AddrModeInsts, [&](Value *V) { return IsNonLocalValue(V, MemoryInst->getParent()); - })) { + })) { DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); return false; } @@ -3935,11 +4358,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && TM && - TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) - ->useAA())) { + SubtargetInfo->useAA())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -3963,6 +4385,20 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, AddrMode.Scale = 0; } + // It is only safe to sign extend the BaseReg if we know that the math + // required to create it did not overflow before we extend it. Since + // the original IR value was tossed in favor of a constant back when + // the AddrMode was created we need to bail out gracefully if widths + // do not match instead of extending it. + // + // (See below for code to add the scale.) 
+ if (AddrMode.Scale) { + Type *ScaledRegTy = AddrMode.ScaledReg->getType(); + if (cast<IntegerType>(IntPtrTy)->getBitWidth() > + cast<IntegerType>(ScaledRegTy)->getBitWidth()) + return false; + } + if (AddrMode.BaseGV) { if (ResultPtr) return false; @@ -3973,14 +4409,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // If the real base value actually came from an inttoptr, then the matcher // will look through it and provide only the integer value. In that case, // use it here. - if (!ResultPtr && AddrMode.BaseReg) { - ResultPtr = - Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr"); - AddrMode.BaseReg = nullptr; - } else if (!ResultPtr && AddrMode.Scale == 1) { - ResultPtr = - Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr"); - AddrMode.Scale = 0; + if (!DL->isNonIntegralPointerType(Addr->getType())) { + if (!ResultPtr && AddrMode.BaseReg) { + ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), + "sunkaddr"); + AddrMode.BaseReg = nullptr; + } else if (!ResultPtr && AddrMode.Scale == 1) { + ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), + "sunkaddr"); + AddrMode.Scale = 0; + } } if (!ResultPtr && @@ -4011,19 +4449,11 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Value *V = AddrMode.ScaledReg; if (V->getType() == IntPtrTy) { // done. - } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < - cast<IntegerType>(V->getType())->getBitWidth()) { - V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { - // It is only safe to sign extend the BaseReg if we know that the math - // required to create it did not overflow before we extend it. Since - // the original IR value was tossed in favor of a constant back when - // the AddrMode was created we need to bail out gracefully if widths - // do not match instead of extending it. - Instruction *I = dyn_cast_or_null<Instruction>(ResultIndex); - if (I && (ResultIndex != AddrMode.BaseReg)) - I->eraseFromParent(); - return false; + assert(cast<IntegerType>(IntPtrTy)->getBitWidth() < + cast<IntegerType>(V->getType())->getBitWidth() && + "We can't transform if ScaledReg is too narrow"); + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } if (AddrMode.Scale != 1) @@ -4042,7 +4472,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // We need to add this separately from the scale above to help with // SDAG consecutive load/store merging. if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } @@ -4053,14 +4483,27 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, SunkAddr = ResultPtr; } else { if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } } else { + // We'd require a ptrtoint/inttoptr down the line, which we can't do for + // non-integral pointers, so in that case bail out now. + Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr; + Type *ScaleTy = AddrMode.Scale ? 
AddrMode.ScaledReg->getType() : nullptr; + PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy); + PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy); + if (DL->isNonIntegralPointerType(Addr->getType()) || + (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) || + (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || + (AddrMode.BaseGV && + DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) + return false; + DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); @@ -4140,9 +4583,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // using it. if (Repl->use_empty()) { // This can cause recursive deletion, which can invalidate our iterator. - // Use a WeakVH to hold onto it in case this happens. + // Use a WeakTrackingVH to hold onto it in case this happens. Value *CurValue = &*CurInstIterator; - WeakVH IterHandle(CurValue); + WeakTrackingVH IterHandle(CurValue); BasicBlock *BB = CurInstIterator->getParent(); RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); @@ -4164,7 +4607,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; const TargetRegisterInfo *TRI = - TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo(); + TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(*DL, TRI, CS); unsigned ArgNo = 0; @@ -4185,14 +4628,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { return MadeChange; } -/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// \brief Check if all the uses of \p Val are equivalent (or free) zero or /// sign extensions. -static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { - assert(!Inst->use_empty() && "Input must have at least one use"); - const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); +static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { + assert(!Val->use_empty() && "Input must have at least one use"); + const Instruction *FirstUser = cast<Instruction>(*Val->user_begin()); bool IsSExt = isa<SExtInst>(FirstUser); Type *ExtTy = FirstUser->getType(); - for (const User *U : Inst->users()) { + for (const User *U : Val->users()) { const Instruction *UI = cast<Instruction>(U); if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) return false; @@ -4202,11 +4645,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { continue; // If IsSExt is true, we are in this situation: - // a = Inst + // a = Val // b = sext ty1 a to ty2 // c = sext ty1 a to ty3 // Assuming ty2 is shorter than ty3, this could be turned into: - // a = Inst + // a = Val // b = sext ty1 a to ty2 // c = sext ty2 b to ty3 // However, the last sext is not free. @@ -4233,51 +4676,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { return true; } -/// \brief Try to form ExtLd by promoting \p Exts until they reach a -/// load instruction. -/// If an ext(load) can be formed, it is returned via \p LI for the load -/// and \p Inst for the extension. -/// Otherwise LI == nullptr and Inst == nullptr. -/// When some promotion happened, \p TPT contains the proper state to -/// revert them. -/// -/// \return true when promoting was necessary to expose the ext(load) -/// opportunity, false otherwise. 
+/// \brief Try to speculatively promote extensions in \p Exts and continue +/// promoting through newly promoted operands recursively as far as doing so is +/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. +/// When some promotion happened, \p TPT contains the proper state to revert +/// them. /// -/// Example: -/// \code -/// %ld = load i32* %addr -/// %add = add nuw i32 %ld, 4 -/// %zext = zext i32 %add to i64 -/// \endcode -/// => -/// \code -/// %ld = load i32* %addr -/// %zext = zext i32 %ld to i64 -/// %add = add nuw i64 %zext, 4 -/// \encode -/// Thanks to the promotion, we can match zext(load i32*) to i64. -bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, - LoadInst *&LI, Instruction *&Inst, - const SmallVectorImpl<Instruction *> &Exts, - unsigned CreatedInstsCost = 0) { - // Iterate over all the extensions to see if one form an ext(load). +/// \return true if some promotion happened, false otherwise. +bool CodeGenPrepare::tryToPromoteExts( + TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, + SmallVectorImpl<Instruction *> &ProfitablyMovedExts, + unsigned CreatedInstsCost) { + bool Promoted = false; + + // Iterate over all the extensions to try to promote them. for (auto I : Exts) { - // Check if we directly have ext(load). - if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) { - Inst = I; - // No promotion happened here. - return false; + // Early check if we directly have ext(load). + if (isa<LoadInst>(I->getOperand(0))) { + ProfitablyMovedExts.push_back(I); + continue; } - // Check whether or not we want to do any promotion. + + // Check whether or not we want to do any promotion. The reason we have + // this check inside the for loop is to catch the case where an extension + // is directly fed by a load because in such case the extension can be moved + // up without any promotion on its operands. if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) - continue; + return false; + // Get the action to perform the promotion. - TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( - I, InsertedInsts, *TLI, PromotedInsts); + TypePromotionHelper::Action TPH = + TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts); // Check if we can promote. - if (!TPH) + if (!TPH) { + // Save the current extension as we cannot move up through its operand. + ProfitablyMovedExts.push_back(I); continue; + } + // Save the current state. TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); @@ -4297,110 +4733,275 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, // one extension but leave one. However, we optimistically keep going, // because the new extension may be removed too. long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; - TotalCreatedInstsCost -= ExtCost; + // FIXME: It would be possible to propagate a negative value instead of + // conservatively ceiling it to 0. + TotalCreatedInstsCost = + std::max((long long)0, (TotalCreatedInstsCost - ExtCost)); if (!StressExtLdPromotion && (TotalCreatedInstsCost > 1 || !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { - // The promotion is not profitable, rollback to the previous state. + // This promotion is not profitable, rollback to the previous state, and + // save the current extension in ProfitablyMovedExts as the latest + // speculative promotion turned out to be unprofitable. 
+ TPT.rollback(LastKnownGood); + ProfitablyMovedExts.push_back(I); + continue; + } + // Continue promoting NewExts as far as doing so is profitable. + SmallVector<Instruction *, 2> NewlyMovedExts; + (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); + bool NewPromoted = false; + for (auto ExtInst : NewlyMovedExts) { + Instruction *MovedExt = cast<Instruction>(ExtInst); + Value *ExtOperand = MovedExt->getOperand(0); + // If we have reached to a load, we need this extra profitability check + // as it could potentially be merged into an ext(load). + if (isa<LoadInst>(ExtOperand) && + !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || + (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI)))) + continue; + + ProfitablyMovedExts.push_back(MovedExt); + NewPromoted = true; + } + + // If none of speculative promotions for NewExts is profitable, rollback + // and save the current extension (I) as the last profitable extension. + if (!NewPromoted) { TPT.rollback(LastKnownGood); + ProfitablyMovedExts.push_back(I); continue; } // The promotion is profitable. - // Check if it exposes an ext(load). - (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); - if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || - // If we have created a new extension, i.e., now we have two - // extensions. We must make sure one of them is merged with - // the load, otherwise we may degrade the code quality. - (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) - // Promotion happened. - return true; - // If this does not help to expose an ext(load) then, rollback. - TPT.rollback(LastKnownGood); + Promoted = true; } - // None of the extension can form an ext(load). - LI = nullptr; - Inst = nullptr; - return false; + return Promoted; +} + +/// Merging redundant sexts when one is dominating the other. +bool CodeGenPrepare::mergeSExts(Function &F) { + DominatorTree DT(F); + bool Changed = false; + for (auto &Entry : ValToSExtendedUses) { + SExts &Insts = Entry.second; + SExts CurPts; + for (Instruction *Inst : Insts) { + if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) || + Inst->getOperand(0) != Entry.first) + continue; + bool inserted = false; + for (auto &Pt : CurPts) { + if (DT.dominates(Inst, Pt)) { + Pt->replaceAllUsesWith(Inst); + RemovedInsts.insert(Pt); + Pt->removeFromParent(); + Pt = Inst; + inserted = true; + Changed = true; + break; + } + if (!DT.dominates(Pt, Inst)) + // Give up if we need to merge in a common dominator as the + // experiments show it is not profitable. + continue; + Inst->replaceAllUsesWith(Pt); + RemovedInsts.insert(Inst); + Inst->removeFromParent(); + inserted = true; + Changed = true; + break; + } + if (!inserted) + CurPts.push_back(Inst); + } + } + return Changed; +} + +/// Return true, if an ext(load) can be formed from an extension in +/// \p MovedExts. +bool CodeGenPrepare::canFormExtLd( + const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, + Instruction *&Inst, bool HasPromoted) { + for (auto *MovedExtInst : MovedExts) { + if (isa<LoadInst>(MovedExtInst->getOperand(0))) { + LI = cast<LoadInst>(MovedExtInst->getOperand(0)); + Inst = MovedExtInst; + break; + } + } + if (!LI) + return false; + + // If they're already in the same block, there's nothing to do. + // Make the cheap checks first if we did not promote. + // If we promoted, we need to check if it is indeed profitable.
+ if (!HasPromoted && LI->getParent() == Inst->getParent()) + return false; + + return TLI->isExtLoad(LI, Inst, *DL); } /// Move a zext or sext fed by a load into the same basic block as the load, /// unless conditions are unfavorable. This allows SelectionDAG to fold the /// extend into the load. -/// \p I[in/out] the extension may be modified during the process if some -/// promotions apply. /// -bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { - // ExtLoad formation infrastructure requires TLI to be effective. +/// E.g., +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \encode +/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which +/// allow us to match zext(load i32*) to i64. +/// +/// Also, try to promote the computations used to obtain a sign extended +/// value used into memory accesses. +/// E.g., +/// \code +/// a = add nsw i32 b, 3 +/// d = sext i32 a to i64 +/// e = getelementptr ..., i64 d +/// \endcode +/// => +/// \code +/// f = sext i32 b to i64 +/// a = add nsw i64 f, 3 +/// e = getelementptr ..., i64 a +/// \endcode +/// +/// \p Inst[in/out] the extension may be modified during the process if some +/// promotions apply. +bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { + // ExtLoad formation and address type promotion infrastructure requires TLI to + // be effective. if (!TLI) return false; - // Try to promote a chain of computation if it allows to form - // an extended load. - TypePromotionTransaction TPT; + bool AllowPromotionWithoutCommonHeader = false; + /// See if it is an interesting sext operations for the address type + /// promotion before trying to promote it, e.g., the ones with the right + /// type and used in memory accesses. + bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( + *Inst, AllowPromotionWithoutCommonHeader); + TypePromotionTransaction TPT(RemovedInsts); TypePromotionTransaction::ConstRestorationPt LastKnownGood = - TPT.getRestorationPoint(); + TPT.getRestorationPoint(); SmallVector<Instruction *, 1> Exts; - Exts.push_back(I); + SmallVector<Instruction *, 2> SpeculativelyMovedExts; + Exts.push_back(Inst); + + bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); + // Look for a load being extended. LoadInst *LI = nullptr; - Instruction *OldExt = I; - bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); - if (!LI || !I) { - assert(!HasPromoted && !LI && "If we did not match any load instruction " - "the code must remain the same"); - I = OldExt; - return false; + Instruction *ExtFedByLoad; + + // Try to promote a chain of computation if it allows to form an extended + // load. + if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { + assert(LI && ExtFedByLoad && "Expect a valid load and extension"); + TPT.commit(); + // Move the extend into the same block as the load + ExtFedByLoad->removeFromParent(); + ExtFedByLoad->insertAfter(LI); + // CGP does not check if the zext would be speculatively executed when moved + // to the same basic block as the load. Preserving its original location + // would pessimize the debugging experience, as well as negatively impact + // the quality of sample pgo. We don't want to use "line 0" as that has a + // size cost in the line-table section and logically the zext can be seen as + // part of the load. 
Therefore we conservatively reuse the same debug + // location for the load and the zext. + ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); + ++NumExtsMoved; + Inst = ExtFedByLoad; + return true; } - // If they're already in the same block, there's nothing to do. - // Make the cheap checks first if we did not promote. - // If we promoted, we need to check if it is indeed profitable. - if (!HasPromoted && LI->getParent() == I->getParent()) - return false; - - EVT VT = TLI->getValueType(*DL, I->getType()); - EVT LoadVT = TLI->getValueType(*DL, LI->getType()); + // Continue promoting SExts if known as considerable depending on targets. + if (ATPConsiderable && + performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, + HasPromoted, TPT, SpeculativelyMovedExts)) + return true; - // If the load has other users and the truncate is not free, this probably - // isn't worthwhile. - if (!LI->hasOneUse() && - (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(I->getType(), LI->getType())) { - I = OldExt; - TPT.rollback(LastKnownGood); - return false; - } + TPT.rollback(LastKnownGood); + return false; +} - // Check whether the target supports casts folded into loads. - unsigned LType; - if (isa<ZExtInst>(I)) - LType = ISD::ZEXTLOAD; - else { - assert(isa<SExtInst>(I) && "Unexpected ext type!"); - LType = ISD::SEXTLOAD; - } - if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) { - I = OldExt; - TPT.rollback(LastKnownGood); +// Perform address type promotion if doing so is profitable. +// If AllowPromotionWithoutCommonHeader == false, we should find other sext +// instructions that sign extended the same initial value. However, if +// AllowPromotionWithoutCommonHeader == true, we expect promoting the +// extension is just profitable. +bool CodeGenPrepare::performAddressTypePromotion( + Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, + bool HasPromoted, TypePromotionTransaction &TPT, + SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { + bool Promoted = false; + SmallPtrSet<Instruction *, 1> UnhandledExts; + bool AllSeenFirst = true; + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + DenseMap<Value *, Instruction *>::iterator AlreadySeen = + SeenChainsForSExt.find(HeadOfChain); + // If there is an unhandled SExt which has the same header, try to promote + // it as well. + if (AlreadySeen != SeenChainsForSExt.end()) { + if (AlreadySeen->second != nullptr) + UnhandledExts.insert(AlreadySeen->second); + AllSeenFirst = false; + } + } + + if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader && + SpeculativelyMovedExts.size() == 1)) { + TPT.commit(); + if (HasPromoted) + Promoted = true; + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + SeenChainsForSExt[HeadOfChain] = nullptr; + ValToSExtendedUses[HeadOfChain].push_back(I); + } + // Update Inst as promotion happen. + Inst = SpeculativelyMovedExts.pop_back_val(); + } else { + // This is the first chain visited from the header, keep the current chain + // as unhandled. Defer to promote this until we encounter another SExt + // chain derived from the same header. + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + SeenChainsForSExt[HeadOfChain] = Inst; + } return false; } - // Move the extend into the same block as the load, so that SelectionDAG - // can fold it. 
- TPT.commit(); - I->removeFromParent(); - I->insertAfter(LI); - // CGP does not check if the zext would be speculatively executed when moved - // to the same basic block as the load. Preserving its original location would - // pessimize the debugging experience, as well as negatively impact the - // quality of sample pgo. We don't want to use "line 0" as that has a - // size cost in the line-table section and logically the zext can be seen as - // part of the load. Therefore we conservatively reuse the same debug location - // for the load and the zext. - I->setDebugLoc(LI->getDebugLoc()); - ++NumExtsMoved; - return true; + if (!AllSeenFirst && !UnhandledExts.empty()) + for (auto VisitedSExt : UnhandledExts) { + if (RemovedInsts.count(VisitedSExt)) + continue; + TypePromotionTransaction TPT(RemovedInsts); + SmallVector<Instruction *, 1> Exts; + SmallVector<Instruction *, 2> Chains; + Exts.push_back(VisitedSExt); + bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); + TPT.commit(); + if (HasPromoted) + Promoted = true; + for (auto I : Chains) { + Value *HeadOfChain = I->getOperand(0); + // Mark this as handled. + SeenChainsForSExt[HeadOfChain] = nullptr; + ValToSExtendedUses[HeadOfChain].push_back(I); + } + } + return Promoted; } bool CodeGenPrepare::optimizeExtUses(Instruction *I) { @@ -4534,13 +5135,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) return false; - // Skip loads we've already transformed or have no reason to transform. - if (Load->hasOneUse()) { - User *LoadUser = *Load->user_begin(); - if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && - !dyn_cast<PHINode>(LoadUser)) - return false; - } + // Skip loads we've already transformed. + if (Load->hasOneUse() && + InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) + return false; // Look at all uses of Load, looking through phis, to determine how many bits // of the loaded value are needed. @@ -4590,16 +5188,14 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { if (!ShlC) return false; uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); - auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); - DemandBits |= ShlDemandBits; + DemandBits.setLowBits(BitWidth - ShiftAmt); break; } case llvm::Instruction::Trunc: { EVT TruncVT = TLI->getValueType(*DL, I->getType()); unsigned TruncBitWidth = TruncVT.getSizeInBits(); - auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); - DemandBits |= TruncBits; + DemandBits.setLowBits(TruncBitWidth); break; } @@ -4620,7 +5216,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { // // Also avoid hoisting if we didn't see any ands with the exact DemandBits // mask, since these are the only ands that will be removed by isel. - if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || + if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) || WidestAndBits != DemandBits) return false; @@ -4636,6 +5232,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { IRBuilder<> Builder(Load->getNextNode()); auto *NewAnd = dyn_cast<Instruction>( Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); + // Mark this instruction as "inserted by CGP", so that other + // optimizations don't touch it. + InsertedInsts.insert(NewAnd); // Replace all uses of load with new and (except for the use of load in the // new and itself). 
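As an aside on the optimizeLoadExt() changes just above, which switch the demanded-bits bookkeeping to APInt::setLowBits() and APInt::isMask(): a small standalone sketch (not part of the patch, and assuming LLVM's ADT headers are available) of what those calls compute for a hypothetical shl-by-24 user of a 32-bit load:

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    const unsigned BitWidth = 32;
    llvm::APInt DemandBits(BitWidth, 0);

    // A user like 'shl i32 %load, 24' only demands the low 8 bits of the load.
    const unsigned ShiftAmt = 24;
    DemandBits.setLowBits(BitWidth - ShiftAmt);

    // The load can only be narrowed when the demanded bits form a low-bit
    // mask, which is what the isMask(ActiveBits) check above verifies.
    const unsigned ActiveBits = DemandBits.getActiveBits();
    assert(ActiveBits == 8 && DemandBits.isMask(ActiveBits));
    return 0;
  }

Only when the union of demanded bits over all users is such a low-bit mask does the pass insert the narrowing 'and' created in the hunk above.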
@@ -4985,7 +5584,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { auto *ExtInst = CastInst::Create(ExtType, Cond, NewType); ExtInst->insertBefore(SI); SI->setCondition(ExtInst); - for (SwitchInst::CaseIt Case : SI->cases()) { + for (auto Case : SI->cases()) { APInt NarrowConst = Case.getCaseValue()->getValue(); APInt WideConst = (ExtType == Instruction::ZExt) ? NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); @@ -4995,6 +5594,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { return true; } + namespace { /// \brief Helper class to promote a scalar operation to a vector one. /// This class is used to move downward extractelement transition. @@ -5473,7 +6073,7 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, return true; } -bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { +bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { // Bail out if we inserted the instruction to prevent optimizations from // stepping on each other's toes. if (InsertedInsts.count(I)) @@ -5483,7 +6083,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) { + if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; @@ -5514,7 +6114,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { - bool MadeChange = moveExtToFormExtLoad(I); + bool MadeChange = optimizeExt(I); return MadeChange | optimizeExtUses(I); } } @@ -5548,8 +6148,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { return false; } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + unsigned AS = RMW->getPointerAddressSpace(); + return optimizeMemoryInst(I, RMW->getPointerOperand(), + RMW->getType(), AS); + } + + if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) { + unsigned AS = CmpX->getPointerAddressSpace(); + return optimizeMemoryInst(I, CmpX->getPointerOperand(), + CmpX->getCompareOperand()->getType(), AS); + } + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); + if (BinOp && (BinOp->getOpcode() == Instruction::And) && + EnableAndCmpSinking && TLI) + return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); + if (BinOp && (BinOp->getOpcode() == Instruction::AShr || BinOp->getOpcode() == Instruction::LShr)) { ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); @@ -5612,7 +6228,7 @@ static bool makeBitReverse(Instruction &I, const DataLayout &DL, // In this pass we look for GEP and cast instructions that are used // across basic blocks and rewrite them to improve basic-block-at-a-time // selection. -bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) { +bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { SunkAddrs.clear(); bool MadeChange = false; @@ -5679,68 +6295,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) { return MadeChange; } -// If there is a sequence that branches based on comparing a single bit -// against zero that can be combined into a single instruction, and the -// target supports folding these into a single instruction, sink the -// mask and compare into the branch uses. 
Do this before OptimizeBlock -> -// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being -// searched for. -bool CodeGenPrepare::sinkAndCmp(Function &F) { - if (!EnableAndCmpSinking) - return false; - if (!TLI || !TLI->isMaskAndBranchFoldingLegal()) - return false; - bool MadeChange = false; - for (BasicBlock &BB : F) { - // Does this BB end with the following? - // %andVal = and %val, #single-bit-set - // %icmpVal = icmp %andResult, 0 - // br i1 %cmpVal label %dest1, label %dest2" - BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator()); - if (!Brcc || !Brcc->isConditional()) - continue; - ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0)); - if (!Cmp || Cmp->getParent() != &BB) - continue; - ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1)); - if (!Zero || !Zero->isZero()) - continue; - Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0)); - if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB) - continue; - ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1)); - if (!Mask || !Mask->getUniqueInteger().isPowerOf2()) - continue; - DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump()); - - // Push the "and; icmp" for any users that are conditional branches. - // Since there can only be one branch use per BB, we don't need to keep - // track of which BBs we insert into. - for (Use &TheUse : Cmp->uses()) { - // Find brcc use. - BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse); - if (!BrccUser || !BrccUser->isConditional()) - continue; - BasicBlock *UserBB = BrccUser->getParent(); - if (UserBB == &BB) continue; - DEBUG(dbgs() << "found Brcc use\n"); - - // Sink the "and; icmp" to use. - MadeChange = true; - BinaryOperator *NewAnd = - BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "", - BrccUser); - CmpInst *NewCmp = - CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero, - "", BrccUser); - TheUse = NewCmp; - ++NumAndCmpsMoved; - DEBUG(BrccUser->getParent()->dump()); - } - } - return MadeChange; -} - /// \brief Scale down both weights to fit into uint32_t. static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; @@ -5833,7 +6387,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) { } // Update PHI nodes in both successors. The original BB needs to be - // replaced in one succesor's PHI nodes, because the branch comes now from + // replaced in one successor's PHI nodes, because the branch comes now from // the newly generated BB (NewBB). In the other successor we need to add one // incoming edge to the PHI nodes, because both branch instructions target // now the same successor. 
Depending on the original branch condition diff --git a/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp b/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp index 1e46a7a..7f7350f 100644 --- a/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp +++ b/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp @@ -41,7 +41,7 @@ namespace { Type *VoidTy = Type::getVoidTy(F.getContext()); Constant *CountingFn = F.getParent()->getOrInsertFunction(CountingFunctionName, - VoidTy, nullptr); + VoidTy); CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt()); return true; } diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 5d60c30..a3cf284 100644 --- a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo &MFI = MF.getFrameInfo(); BitVector Pristine = MFI.getPristineRegs(MF); - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { - if (!IsReturnBlock && !Pristine.test(*I)) continue; + for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; + ++I) { + unsigned Reg = *I; + if (!IsReturnBlock && !Pristine.test(Reg)) + continue; for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); @@ -645,10 +648,8 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, // as well. const SUnit *SU = MISUnitMap[Q->second->getParent()]; if (!SU) continue; - for (DbgValueVector::iterator DVI = DbgValues.begin(), - DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q->second->getParent()) - UpdateDbgValue(*DVI->first, AntiDepReg, NewReg); + UpdateDbgValues(DbgValues, Q->second->getParent(), + AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp index 7b1b2d6..853b9af 100644 --- a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -23,49 +23,59 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "packets" - #include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <memory> +#include <vector> using namespace llvm; +#define DEBUG_TYPE "packets" + static cl::opt<unsigned> InstrLimit("dfa-instr-limit", cl::Hidden, cl::init(0), cl::desc("If present, stops packetizing after N instructions")); + static unsigned InstrCount = 0; // -------------------------------------------------------------------- // Definitions shared between DFAPacketizer.cpp and DFAPacketizerEmitter.cpp -namespace { - DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) { - return (Inp << DFA_MAX_RESOURCES) | FuncUnits; - } 
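As an aside on addDFAFuncUnits(), which the hunk here turns from an anonymous-namespace helper into the file-static function re-added just below: it packs one functional-unit mask per scheduling term into a single DFAInput word. A standalone sketch (not part of the patch; DFA_MAX_RESOURCES is set to a made-up 16 here, whereas the real width comes from the TableGen-generated tables):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  using DFAInput = uint64_t;
  constexpr unsigned DFA_MAX_RESOURCES = 16; // hypothetical width per term

  static DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) {
    return (Inp << DFA_MAX_RESOURCES) | FuncUnits;
  }

  int main() {
    // Two scheduling terms with functional-unit masks 0x3 and 0x8 land in
    // adjacent 16-bit fields of the packed input, just as getDFAInsnInput()
    // folds an instruction class vector below.
    std::vector<unsigned> InsnClass = {0x3, 0x8};
    DFAInput InsnInput = 0;
    for (unsigned U : InsnClass)
      InsnInput = addDFAFuncUnits(InsnInput, U);
    std::printf("0x%llx\n", (unsigned long long)InsnInput); // prints 0x30008
    return 0;
  }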
+static DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) { + return (Inp << DFA_MAX_RESOURCES) | FuncUnits; +} - /// Return the DFAInput for an instruction class input vector. - /// This function is used in both DFAPacketizer.cpp and in - /// DFAPacketizerEmitter.cpp. - DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) { - DFAInput InsnInput = 0; - assert((InsnClass.size() <= DFA_MAX_RESTERMS) && - "Exceeded maximum number of DFA terms"); - for (auto U : InsnClass) - InsnInput = addDFAFuncUnits(InsnInput, U); - return InsnInput; - } +/// Return the DFAInput for an instruction class input vector. +/// This function is used in both DFAPacketizer.cpp and in +/// DFAPacketizerEmitter.cpp. +static DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) { + DFAInput InsnInput = 0; + assert((InsnClass.size() <= DFA_MAX_RESTERMS) && + "Exceeded maximum number of DFA terms"); + for (auto U : InsnClass) + InsnInput = addDFAFuncUnits(InsnInput, U); + return InsnInput; } + // -------------------------------------------------------------------- DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, const DFAStateInput (*SIT)[2], const unsigned *SET): - InstrItins(I), CurrentState(0), DFAStateInputTable(SIT), - DFAStateEntryTable(SET) { + InstrItins(I), DFAStateInputTable(SIT), DFAStateEntryTable(SET) { // Make sure DFA types are large enough for the number of terms & resources. static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAInput)), @@ -75,7 +85,6 @@ DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput"); } - // Read the DFA transition table and update CachedTable. // // Format of the transition tables: @@ -97,7 +106,6 @@ void DFAPacketizer::ReadTable(unsigned int state) { DFAStateInputTable[i][1]; } - // Return the DFAInput for an instruction class. DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) { // Note: this logic must match that in DFAPacketizerDefs.h for input vectors. @@ -112,16 +120,14 @@ DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) { return InsnInput; } - // Return the DFAInput for an instruction class input vector. DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) { return getDFAInsnInput(InsnClass); } - // Check if the resources occupied by a MCInstrDesc are available in the // current state. -bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) { +bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) { unsigned InsnClass = MID->getSchedClass(); DFAInput InsnInput = getInsnInput(InsnClass); UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput); @@ -129,10 +135,9 @@ bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) { return CachedTable.count(StateTrans) != 0; } - // Reserve the resources occupied by a MCInstrDesc and change the current // state to reflect that change. -void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) { +void DFAPacketizer::reserveResources(const MCInstrDesc *MID) { unsigned InsnClass = MID->getSchedClass(); DFAInput InsnInput = getInsnInput(InsnClass); UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput); @@ -141,24 +146,22 @@ void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) { CurrentState = CachedTable[StateTrans]; } - // Check if the resources occupied by a machine instruction are available // in the current state. 
-bool DFAPacketizer::canReserveResources(llvm::MachineInstr &MI) { - const llvm::MCInstrDesc &MID = MI.getDesc(); +bool DFAPacketizer::canReserveResources(MachineInstr &MI) { + const MCInstrDesc &MID = MI.getDesc(); return canReserveResources(&MID); } - // Reserve the resources occupied by a machine instruction and change the // current state to reflect that change. -void DFAPacketizer::reserveResources(llvm::MachineInstr &MI) { - const llvm::MCInstrDesc &MID = MI.getDesc(); +void DFAPacketizer::reserveResources(MachineInstr &MI) { + const MCInstrDesc &MID = MI.getDesc(); reserveResources(&MID); } - namespace llvm { + // This class extends ScheduleDAGInstrs and overrides the schedule method // to build the dependence graph. class DefaultVLIWScheduler : public ScheduleDAGInstrs { @@ -166,9 +169,11 @@ private: AliasAnalysis *AA; /// Ordered list of DAG postprocessing steps. std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations; + public: DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA); + // Actual scheduling work. void schedule() override; @@ -176,11 +181,12 @@ public: void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) { Mutations.push_back(std::move(Mutation)); } + protected: void postprocessDAG(); }; -} +} // end namespace llvm DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, @@ -189,21 +195,18 @@ DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF, CanHandleTerminators = true; } - /// Apply each ScheduleDAGMutation step in order. void DefaultVLIWScheduler::postprocessDAG() { for (auto &M : Mutations) M->apply(this); } - void DefaultVLIWScheduler::schedule() { // Build the scheduling graph. buildSchedGraph(AA); postprocessDAG(); } - VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf, MachineLoopInfo &mli, AliasAnalysis *aa) : MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) { @@ -211,15 +214,11 @@ VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf, VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA); } - VLIWPacketizerList::~VLIWPacketizerList() { - if (VLIWScheduler) - delete VLIWScheduler; - if (ResourceTracker) - delete ResourceTracker; + delete VLIWScheduler; + delete ResourceTracker; } - // End the current packet, bundle packet instructions and reset DFA state. void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, MachineBasicBlock::iterator MI) { @@ -239,7 +238,6 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, DEBUG(dbgs() << "End packet\n"); } - // Bundle machine instructions into packets. void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, MachineBasicBlock::iterator BeginItr, @@ -338,7 +336,6 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, VLIWScheduler->finishBlock(); } - // Add a DAG mutation object to the ordered list. 
void VLIWPacketizerList::addMutation( std::unique_ptr<ScheduleDAGMutation> Mutation) { diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 17c229a..91d18e2 100644 --- a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -23,7 +23,7 @@ using namespace llvm; -#define DEBUG_TYPE "codegen-dce" +#define DEBUG_TYPE "dead-mi-elimination" STATISTIC(NumDeletes, "Number of dead instructions deleted"); @@ -54,7 +54,7 @@ namespace { char DeadMachineInstructionElim::ID = 0; char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID; -INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination", +INITIALIZE_PASS(DeadMachineInstructionElim, DEBUG_TYPE, "Remove dead machine instructions", false, false) bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { @@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { // Start out assuming that reserved registers are live out of this block. LivePhysRegs = MRI->getReservedRegs(); - // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not + // Add live-ins from successors to LivePhysRegs. Normally, physregs are not // live across blocks, but some targets (x86) can have flags live out of a // block. for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), diff --git a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp index a7ba694..ab9a059 100644 --- a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp +++ b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp @@ -132,8 +132,7 @@ private: char DetectDeadLanes::ID = 0; char &llvm::DetectDeadLanesID = DetectDeadLanes::ID; -INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes", - false, false) +INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false) /// Returns true if \p MI will get lowered to a series of COPY instructions. /// We call this a COPY-like instruction. 
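As an aside on a pattern that recurs in these hunks (DFAPacketizer, DeadMachineInstructionElim, DetectDeadLanes, and EarlyIfConverter below): DEBUG_TYPE is now defined once after the includes and reused as the pass name. A minimal standalone sketch (not part of the patch; the statistic name and strings are placeholders) of the ordering this relies on, since both STATISTIC and DEBUG expand to uses of DEBUG_TYPE:

  #include "llvm/ADT/Statistic.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // DEBUG_TYPE must be visible before the first STATISTIC or DEBUG use, which
  // is why these files define it right after the includes.
  #define DEBUG_TYPE "example-pass"

  STATISTIC(NumThingsSeen, "Number of things seen");

  static void noteThing() {
    ++NumThingsSeen;
    DEBUG(dbgs() << DEBUG_TYPE << ": saw a thing\n");
  }

  int main() {
    noteThing();
    return 0;
  }

INITIALIZE_PASS then takes the same DEBUG_TYPE string as the pass's command-line name, which is exactly what the DeadMachineInstructionElim and DetectDeadLanes hunks above switch to.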
@@ -441,7 +440,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) { const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg); CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO); if (CrossCopy) - DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI); + DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI); } if (!CrossCopy) diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp index 38af19a..2f83326 100644 --- a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp @@ -12,12 +12,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -34,8 +35,6 @@ STATISTIC(NumResumesLowered, "Number of resume calls lowered"); namespace { class DwarfEHPrepare : public FunctionPass { - const TargetMachine *TM; - // RewindFunction - _Unwind_Resume or the target equivalent. Constant *RewindFunction; @@ -52,15 +51,9 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. - // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in - // practice. DwarfEHPrepare() - : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr), DT(nullptr), - TLI(nullptr) {} - - DwarfEHPrepare(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), RewindFunction(nullptr), DT(nullptr), - TLI(nullptr) {} + : FunctionPass(ID), RewindFunction(nullptr), DT(nullptr), TLI(nullptr) { + } bool runOnFunction(Function &Fn) override; @@ -78,18 +71,18 @@ namespace { } // end anonymous namespace char DwarfEHPrepare::ID = 0; -INITIALIZE_TM_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare", - "Prepare DWARF exceptions", false, false) +INITIALIZE_PASS_BEGIN(DwarfEHPrepare, DEBUG_TYPE, + "Prepare DWARF exceptions", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_TM_PASS_END(DwarfEHPrepare, "dwarfehprepare", - "Prepare DWARF exceptions", false, false) +INITIALIZE_PASS_END(DwarfEHPrepare, DEBUG_TYPE, + "Prepare DWARF exceptions", false, false) -FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) { - return new DwarfEHPrepare(TM); -} +FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); } void DwarfEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); } @@ -254,9 +247,10 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) { } bool DwarfEHPrepare::runOnFunction(Function &Fn) { - assert(TM && "DWARF EH preparation requires a target machine"); + const TargetMachine &TM = + getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = TM->getSubtargetImpl(Fn)->getTargetLowering(); + TLI = TM.getSubtargetImpl(Fn)->getTargetLowering(); bool Changed = InsertUnwindResumeCalls(Fn); DT = nullptr; TLI = nullptr; diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp 
b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp index 7291727..402afe7 100644 --- a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -616,13 +616,13 @@ private: char EarlyIfConverter::ID = 0; char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; -INITIALIZE_PASS_BEGIN(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineBranchProbabilityInfo>(); diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp index 32c57e3..e272d25 100644 --- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp +++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp @@ -6,21 +6,9 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the execution dependency fix pass. -// -// Some X86 SSE instructions like mov, and, or, xor are available in different -// variants for different operand types. These variant instructions are -// equivalent, but on Nehalem and newer cpus there is extra latency -// transferring data between integer and floating point domains. ARM cores -// have similar issues when they are configured with both VFP and NEON -// pipelines. -// -// This pass changes the variant instructions to minimize domain crossings. -// -//===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ExecutionDepsFix.h" + #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,193 +23,18 @@ using namespace llvm; -#define DEBUG_TYPE "execution-fix" - -/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track -/// of execution domains. -/// -/// An open DomainValue represents a set of instructions that can still switch -/// execution domain. Multiple registers may refer to the same open -/// DomainValue - they will eventually be collapsed to the same execution -/// domain. -/// -/// A collapsed DomainValue represents a single register that has been forced -/// into one of more execution domains. There is a separate collapsed -/// DomainValue for each register, but it may contain multiple execution -/// domains. A register value is initially created in a single execution -/// domain, but if we were forced to pay the penalty of a domain crossing, we -/// keep track of the fact that the register is now available in multiple -/// domains. -namespace { -struct DomainValue { - // Basic reference counting. - unsigned Refs; - - // Bitmask of available domains. For an open DomainValue, it is the still - // possible domains for collapsing. For a collapsed DomainValue it is the - // domains where the register is available for free. - unsigned AvailableDomains; - - // Pointer to the next DomainValue in a chain. 
When two DomainValues are - // merged, Victim.Next is set to point to Victor, so old DomainValue - // references can be updated by following the chain. - DomainValue *Next; - - // Twiddleable instructions using or defining these registers. - SmallVector<MachineInstr*, 8> Instrs; - - // A collapsed DomainValue has no instructions to twiddle - it simply keeps - // track of the domains where the registers are already available. - bool isCollapsed() const { return Instrs.empty(); } - - // Is domain available? - bool hasDomain(unsigned domain) const { - assert(domain < - static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && - "undefined behavior"); - return AvailableDomains & (1u << domain); - } - - // Mark domain as available. - void addDomain(unsigned domain) { - AvailableDomains |= 1u << domain; - } - - // Restrict to a single domain available. - void setSingleDomain(unsigned domain) { - AvailableDomains = 1u << domain; - } - - // Return bitmask of domains that are available and in mask. - unsigned getCommonDomains(unsigned mask) const { - return AvailableDomains & mask; - } - - // First domain available. - unsigned getFirstDomain() const { - return countTrailingZeros(AvailableDomains); - } - - DomainValue() : Refs(0) { clear(); } - - // Clear this DomainValue and point to next which has all its data. - void clear() { - AvailableDomains = 0; - Next = nullptr; - Instrs.clear(); - } -}; -} - -namespace { -/// Information about a live register. -struct LiveReg { - /// Value currently in this register, or NULL when no value is being tracked. - /// This counts as a DomainValue reference. - DomainValue *Value; - - /// Instruction that defined this register, relative to the beginning of the - /// current basic block. When a LiveReg is used to represent a live-out - /// register, this value is relative to the end of the basic block, so it - /// will be a negative number. - int Def; -}; -} // anonymous namespace - -namespace { -class ExeDepsFix : public MachineFunctionPass { - static char ID; - SpecificBumpPtrAllocator<DomainValue> Allocator; - SmallVector<DomainValue*,16> Avail; - - const TargetRegisterClass *const RC; - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RegClassInfo; - std::vector<SmallVector<int, 1>> AliasMap; - const unsigned NumRegs; - LiveReg *LiveRegs; - typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap; - LiveOutMap LiveOuts; - - /// List of undefined register reads in this block in forward order. - std::vector<std::pair<MachineInstr*, unsigned> > UndefReads; - - /// Storage for register unit liveness. - LivePhysRegs LiveRegSet; - - /// Current instruction number. - /// The first instruction in each basic block is 0. - int CurInstr; - - /// True when the current block has a predecessor that hasn't been visited - /// yet. 
- bool SeenUnknownBackEdge; - -public: - ExeDepsFix(const TargetRegisterClass *rc) - : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - - StringRef getPassName() const override { return "Execution dependency fix"; } - -private: - iterator_range<SmallVectorImpl<int>::const_iterator> - regIndices(unsigned Reg) const; - - // DomainValue allocation. - DomainValue *alloc(int domain = -1); - DomainValue *retain(DomainValue *DV) { - if (DV) ++DV->Refs; - return DV; - } - void release(DomainValue*); - DomainValue *resolve(DomainValue*&); - - // LiveRegs manipulations. - void setLiveReg(int rx, DomainValue *DV); - void kill(int rx); - void force(int rx, unsigned domain); - void collapse(DomainValue *dv, unsigned domain); - bool merge(DomainValue *A, DomainValue *B); - - void enterBasicBlock(MachineBasicBlock*); - void leaveBasicBlock(MachineBasicBlock*); - void visitInstr(MachineInstr*); - void processDefs(MachineInstr*, bool Kill); - void visitSoftInstr(MachineInstr*, unsigned mask); - void visitHardInstr(MachineInstr*, unsigned domain); - void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); - bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); - void processUndefReads(MachineBasicBlock*); -}; -} - -char ExeDepsFix::ID = 0; +#define DEBUG_TYPE "execution-deps-fix" /// Translate TRI register number to a list of indices into our smaller tables /// of interesting registers. iterator_range<SmallVectorImpl<int>::const_iterator> -ExeDepsFix::regIndices(unsigned Reg) const { +ExecutionDepsFix::regIndices(unsigned Reg) const { assert(Reg < AliasMap.size() && "Invalid register"); const auto &Entry = AliasMap[Reg]; return make_range(Entry.begin(), Entry.end()); } -DomainValue *ExeDepsFix::alloc(int domain) { +DomainValue *ExecutionDepsFix::alloc(int domain) { DomainValue *dv = Avail.empty() ? new(Allocator.Allocate()) DomainValue : Avail.pop_back_val(); @@ -234,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) { /// Release a reference to DV. When the last reference is released, /// collapse if needed. -void ExeDepsFix::release(DomainValue *DV) { +void ExecutionDepsFix::release(DomainValue *DV) { while (DV) { assert(DV->Refs && "Bad DomainValue"); if (--DV->Refs) @@ -254,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) { /// Follow the chain of dead DomainValues until a live DomainValue is reached. /// Update the referenced pointer when necessary. -DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { +DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) { DomainValue *DV = DVRef; if (!DV || !DV->Next) return DV; @@ -271,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { } /// Set LiveRegs[rx] = dv, updating reference counts. -void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { +void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); @@ -283,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { } // Kill register rx, recycle or collapse any DomainValue. 
-void ExeDepsFix::kill(int rx) { +void ExecutionDepsFix::kill(int rx) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); if (!LiveRegs[rx].Value) @@ -294,7 +107,7 @@ void ExeDepsFix::kill(int rx) { } /// Force register rx into domain. -void ExeDepsFix::force(int rx, unsigned domain) { +void ExecutionDepsFix::force(int rx, unsigned domain) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); if (DomainValue *dv = LiveRegs[rx].Value) { @@ -317,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) { /// Collapse open DomainValue into given domain. If there are multiple /// registers using dv, they each get a unique collapsed DomainValue. -void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { +void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) { assert(dv->hasDomain(domain) && "Cannot collapse"); // Collapse all the instructions. @@ -333,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { } /// All instructions and registers in B are moved to A, and B is released. -bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { +bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) { assert(!A->isCollapsed() && "Cannot merge into collapsed"); assert(!B->isCollapsed() && "Cannot merge from collapsed"); if (A == B) @@ -359,10 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { } /// Set up LiveRegs by merging predecessor live-out values. -void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { - // Detect back-edges from predecessors we haven't processed yet. - SeenUnknownBackEdge = false; - +void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Reset instruction counter in each basic block. CurInstr = 0; @@ -397,18 +207,21 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Try to coalesce live-out registers from predecessors. for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), pe = MBB->pred_end(); pi != pe; ++pi) { - LiveOutMap::const_iterator fi = LiveOuts.find(*pi); - if (fi == LiveOuts.end()) { - SeenUnknownBackEdge = true; + auto fi = MBBInfos.find(*pi); + assert(fi != MBBInfos.end() && + "Should have pre-allocated MBBInfos for all MBBs"); + LiveReg *Incoming = fi->second.OutRegs; + // Incoming is null if this is a backedge from a BB + // we haven't processed yet + if (Incoming == nullptr) { continue; } - assert(fi->second && "Can't have NULL entries"); for (unsigned rx = 0; rx != NumRegs; ++rx) { // Use the most recent predecessor def for each register. - LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def); + LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); - DomainValue *pdv = resolve(fi->second[rx].Value); + DomainValue *pdv = resolve(Incoming[rx].Value); if (!pdv) continue; if (!LiveRegs[rx].Value) { @@ -432,35 +245,34 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { force(rx, pdv->getFirstDomain()); } } - DEBUG(dbgs() << "BB#" << MBB->getNumber() - << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n")); + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); } -void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { +void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { assert(LiveRegs && "Must enter basic block first."); - // Save live registers at end of MBB - used by enterBasicBlock(). 
- // Also use LiveOuts as a visited set to detect back-edges. - bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second; - - if (First) { - // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to - // the end of this block instead of the beginning. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - LiveRegs[i].Def -= CurInstr; - } else { - // Insertion failed, this must be the second pass. + LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; + // Save register clearances at end of MBB - used by enterBasicBlock(). + MBBInfos[MBB].OutRegs = LiveRegs; + + // While processing the basic block, we kept `Def` relative to the start + // of the basic block for convenience. However, future use of this information + // only cares about the clearance from the end of the block, so adjust + // everything to be relative to the end of the basic block. + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def -= CurInstr; + if (OldOutRegs) { + // This must be the second pass. // Release all the DomainValues instead of keeping them. for (unsigned i = 0, e = NumRegs; i != e; ++i) - release(LiveRegs[i].Value); - delete[] LiveRegs; + release(OldOutRegs[i].Value); + delete[] OldOutRegs; } LiveRegs = nullptr; } -void ExeDepsFix::visitInstr(MachineInstr *MI) { - if (MI->isDebugValue()) - return; - +bool ExecutionDepsFix::visitInstr(MachineInstr *MI) { // Update instructions with explicit execution domains. std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI); if (DomP.first) { @@ -470,16 +282,16 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) { visitHardInstr(MI, DomP.first); } - // Process defs to track register ages, and kill values clobbered by generic - // instructions. - processDefs(MI, !DomP.first); + return !DomP.first; } /// \brief Helps avoid false dependencies on undef registers by updating the /// machine instructions' undef operand to use a register that the instruction /// is truly dependent on, or use a register with clearance higher than Pref. -void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +/// Returns true if it was able to find a true dependency, thus not requiring +/// a dependency breaking instruction regardless of clearance. +bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, + unsigned OpIdx, unsigned Pref) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); @@ -487,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // Update only undef operands that are mapped to one register. if (AliasMap[OriginalReg].size() != 1) - return; + return false; // Get the undef operand's register class const TargetRegisterClass *OpRC = @@ -502,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // We found a true dependency - replace the undef register with the true // dependency. MO.setReg(CurrMO.getReg()); - return; + return true; } // Go over all registers in the register class and find the register with @@ -527,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // Update the operand if we found a register with better clearance. if (MaxClearanceReg != OriginalReg) MO.setReg(MaxClearanceReg); + + return false; } /// \brief Return true to if it makes sense to break dependence on a partial def /// or undef use. 
-bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { unsigned reg = MI->getOperand(OpIdx).getReg(); for (int rx : regIndices(reg)) { unsigned Clearance = CurInstr - LiveRegs[rx].Def; @@ -542,14 +356,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, DEBUG(dbgs() << ": Break dependency.\n"); continue; } - // The current clearance seems OK, but we may be ignoring a def from a - // back-edge. - if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { - DEBUG(dbgs() << ": OK .\n"); - return false; - } - // A def from an unprocessed back-edge may make us break this dependency. - DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); + DEBUG(dbgs() << ": OK .\n"); return false; } return true; @@ -559,16 +366,22 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, // If Kill is set, also kill off DomainValues clobbered by the defs. // // Also break dependencies on partial defs and undef uses. -void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { +void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, + bool Kill) { assert(!MI->isDebugValue() && "Won't process debug values"); // Break dependence on undef uses. Do this before updating LiveRegs below. unsigned OpNum; - unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); - if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); + if (breakDependency) { + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { + bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); + // We don't need to bother trying to break a dependency if this + // instruction has a true dependency on that register through another + // operand - we'll have to wait for it to be available regardless. + if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } } const MCInstrDesc &MCID = MI->getDesc(); for (unsigned i = 0, @@ -584,11 +397,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr << '\t' << *MI); - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. - unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(*MI, i, TRI); + if (breakDependency) { + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(*MI, i, TRI); + } // How many instructions since rx was last written? LiveRegs[rx].Def = CurInstr; @@ -607,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { /// only do it on demand. Note that the occurrence of undefined register reads /// that should be broken is very rare, but when they occur we may have many in /// a single block. 
-void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { +void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) { if (UndefReads.empty()) return; @@ -640,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { // A hard instruction only works in one domain. All input registers will be // forced into that domain. -void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { +void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { // Collapse all uses. for (unsigned i = mi->getDesc().getNumDefs(), e = mi->getDesc().getNumOperands(); i != e; ++i) { @@ -663,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { } // A soft instruction can be changed to work in other domains given by mask. -void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { +void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { // Bitmask of available domains for this instruction after taking collapsed // operands into account. unsigned available = mask; @@ -774,7 +589,34 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { } } -bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { +void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB, + bool PrimaryPass) { + enterBasicBlock(MBB); + // If this block is not done, it makes little sense to make any decisions + // based on clearance information. We need to make a second pass anyway, + // and by then we'll have better information, so we can avoid doing the work + // to try and break dependencies now. + bool breakDependency = isBlockDone(MBB); + for (MachineInstr &MI : *MBB) { + if (!MI.isDebugValue()) { + bool Kill = false; + if (PrimaryPass) + Kill = visitInstr(&MI); + processDefs(&MI, breakDependency, Kill); + } + } + if (breakDependency) + processUndefReads(MBB); + leaveBasicBlock(MBB); +} + +bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) { + return MBBInfos[MBB].PrimaryCompleted && + MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && + MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); +} + +bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) { if (skipFunction(*mf.getFunction())) return false; MF = &mf; @@ -810,52 +652,104 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { AliasMap[*AI].push_back(i); } + // Initialize the MMBInfos + for (auto &MBB : mf) { + MBBInfo InitialInfo; + MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); + } + + /* + * We want to visit every instruction in every basic block in order to update + * it's execution domain or break any false dependencies. However, for the + * dependency breaking, we need to know clearances from all predecessors + * (including any backedges). One way to do so would be to do two complete + * passes over all basic blocks/instructions, the first for recording + * clearances, the second to break the dependencies. However, for functions + * without backedges, or functions with a lot of straight-line code, and + * a small loop, that would be a lot of unnecessary work (since only the + * BBs that are part of the loop require two passes). As an example, + * consider the following loop. 
+ * + * + * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT + * ^ | + * +----------------------------------+ + * + * The iteration order is as follows: + * Naive: PH A B C D A' B' C' D' + * Optimized: PH A B C A' B' C' D + * + * Note that we avoid processing D twice, because we can entirely process + * the predecessors before getting to D. We call a block that is ready + * for its second round of processing `done` (isBlockDone). Once we finish + * processing some block, we update the counters in MBBInfos and re-process + * any successors that are now done. + */ + MachineBasicBlock *Entry = &*MF->begin(); ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry); - SmallVector<MachineBasicBlock*, 16> Loops; + SmallVector<MachineBasicBlock *, 4> Workqueue; for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { MachineBasicBlock *MBB = *MBBI; - enterBasicBlock(MBB); - if (SeenUnknownBackEdge) - Loops.push_back(MBB); - for (MachineInstr &MI : *MBB) - visitInstr(&MI); - processUndefReads(MBB); - leaveBasicBlock(MBB); + // N.B: IncomingProcessed and IncomingCompleted were already updated while + // processing this block's predecessors. + MBBInfos[MBB].PrimaryCompleted = true; + MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; + bool Primary = true; + Workqueue.push_back(MBB); + while (!Workqueue.empty()) { + MachineBasicBlock *ActiveMBB = &*Workqueue.back(); + Workqueue.pop_back(); + processBasicBlock(ActiveMBB, Primary); + bool Done = isBlockDone(ActiveMBB); + for (auto *Succ : ActiveMBB->successors()) { + if (!isBlockDone(Succ)) { + if (Primary) { + MBBInfos[Succ].IncomingProcessed++; + } + if (Done) { + MBBInfos[Succ].IncomingCompleted++; + } + if (isBlockDone(Succ)) { + Workqueue.push_back(Succ); + } + } + } + Primary = false; + } } - // Visit all the loop blocks again in order to merge DomainValues from - // back-edges. - for (MachineBasicBlock *MBB : Loops) { - enterBasicBlock(MBB); - for (MachineInstr &MI : *MBB) - if (!MI.isDebugValue()) - processDefs(&MI, false); - processUndefReads(MBB); - leaveBasicBlock(MBB); + // We need to go through again and finalize any blocks that are not done yet. + // This is possible if blocks have dead predecessors, so we didn't visit them + // above. + for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + if (!isBlockDone(MBB)) { + processBasicBlock(MBB, false); + // Don't update successors here. We'll get to them anyway through this + // loop. + } } // Clear the LiveOuts vectors and collapse any remaining DomainValues. 
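To make the new traversal in ExecutionDepsFix::runOnMachineFunction easier to follow, here is a minimal self-contained sketch of the same bookkeeping in plain C++ (no LLVM types; the block names and the back-edge are taken from the example in the comment above). It only models when a block counts as done and when it is re-queued for its second pass; isDone mirrors isBlockDone.

#include <cstdio>
#include <string>
#include <vector>

struct Block {
  std::string Name;
  std::vector<int> Succs;
  int NumPreds = 0;
  bool PrimaryCompleted = false;
  int PrimaryIncoming = 0, IncomingProcessed = 0, IncomingCompleted = 0;
};

// Mirrors ExecutionDepsFix::isBlockDone.
static bool isDone(const Block &B) {
  return B.PrimaryCompleted && B.IncomingCompleted == B.PrimaryIncoming &&
         B.IncomingProcessed == B.NumPreds;
}

int main() {
  // PH -> A -> B -> C -> D -> EXIT, with the back-edge C -> A, as in the
  // comment above. Indices double as the reverse post-order.
  std::vector<Block> CFG = {{"PH", {1}}, {"A", {2}}, {"B", {3}},
                            {"C", {4, 1}}, {"D", {5}}, {"EXIT", {}}};
  for (const Block &B : CFG)
    for (int S : B.Succs)
      ++CFG[S].NumPreds;

  for (unsigned I = 0, E = (unsigned)CFG.size(); I != E; ++I) { // RPO
    CFG[I].PrimaryCompleted = true;
    CFG[I].PrimaryIncoming = CFG[I].IncomingProcessed;
    bool Primary = true;
    std::vector<unsigned> Workqueue(1, I);
    while (!Workqueue.empty()) {
      unsigned Active = Workqueue.back();
      Workqueue.pop_back();
      std::printf("%s%s ", CFG[Active].Name.c_str(), Primary ? "" : "'");
      bool Done = isDone(CFG[Active]);
      for (int S : CFG[Active].Succs) {
        if (isDone(CFG[S]))
          continue;
        if (Primary)
          ++CFG[S].IncomingProcessed;
        if (Done)
          ++CFG[S].IncomingCompleted;
        if (isDone(CFG[S]))
          Workqueue.push_back(S); // ready for its second pass
      }
      Primary = false;
    }
  }
  std::printf("\n"); // prints: PH A B C A' B' C' D EXIT
  return 0;
}

The printed order is the "Optimized" sequence from the comment (plus EXIT): D and EXIT are already done the first time they are reached, so they never take a second pass, which is exactly the saving the comment describes.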
for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI); - if (FI == LiveOuts.end() || !FI->second) + auto FI = MBBInfos.find(*MBBI); + if (FI == MBBInfos.end() || !FI->second.OutRegs) continue; for (unsigned i = 0, e = NumRegs; i != e; ++i) - if (FI->second[i].Value) - release(FI->second[i].Value); - delete[] FI->second; + if (FI->second.OutRegs[i].Value) + release(FI->second.OutRegs[i].Value); + delete[] FI->second.OutRegs; } - LiveOuts.clear(); + MBBInfos.clear(); UndefReads.clear(); Avail.clear(); Allocator.DestroyAll(); return false; } - -FunctionPass * -llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) { - return new ExeDepsFix(RC); -} diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp index 0ec79c2..324ea17 100644 --- a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp +++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp @@ -14,9 +14,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -41,7 +41,7 @@ namespace { char ExpandISelPseudos::ID = 0; char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID; -INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos", +INITIALIZE_PASS(ExpandISelPseudos, DEBUG_TYPE, "Expand ISel Pseudo-instructions", false, false) bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { diff --git a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp index ab2382e..4ce86f2 100644 --- a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -12,11 +12,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -58,7 +58,7 @@ private: char ExpandPostRA::ID = 0; char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID; -INITIALIZE_PASS(ExpandPostRA, "postrapseudos", +INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE, "Post-RA pseudo instruction expansion pass", false, false) /// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered @@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) { MachineOperand &DstMO = MI->getOperand(0); MachineOperand &SrcMO = MI->getOperand(1); - if (SrcMO.getReg() == DstMO.getReg()) { - DEBUG(dbgs() << "identity copy: " << *MI); + bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg()); + if (IdentityCopy || SrcMO.isUndef()) { + DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI); // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. 
if (SrcMO.isUndef() || MI->getNumOperands() > 2) { diff --git a/contrib/llvm/lib/CodeGen/ExpandReductions.cpp b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp new file mode 100644 index 0000000..70dca3b --- /dev/null +++ b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp @@ -0,0 +1,167 @@ +//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR expansion for reduction intrinsics, allowing targets +// to enable the experimental intrinsics until just before codegen. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ExpandReductions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +namespace { + +unsigned getOpcode(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + return Instruction::FAdd; + case Intrinsic::experimental_vector_reduce_fmul: + return Instruction::FMul; + case Intrinsic::experimental_vector_reduce_add: + return Instruction::Add; + case Intrinsic::experimental_vector_reduce_mul: + return Instruction::Mul; + case Intrinsic::experimental_vector_reduce_and: + return Instruction::And; + case Intrinsic::experimental_vector_reduce_or: + return Instruction::Or; + case Intrinsic::experimental_vector_reduce_xor: + return Instruction::Xor; + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + return Instruction::ICmp; + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + return Instruction::FCmp; + default: + llvm_unreachable("Unexpected ID"); + } +} + +RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_smax: + return RecurrenceDescriptor::MRK_SIntMax; + case Intrinsic::experimental_vector_reduce_smin: + return RecurrenceDescriptor::MRK_SIntMin; + case Intrinsic::experimental_vector_reduce_umax: + return RecurrenceDescriptor::MRK_UIntMax; + case Intrinsic::experimental_vector_reduce_umin: + return RecurrenceDescriptor::MRK_UIntMin; + case Intrinsic::experimental_vector_reduce_fmax: + return RecurrenceDescriptor::MRK_FloatMax; + case Intrinsic::experimental_vector_reduce_fmin: + return RecurrenceDescriptor::MRK_FloatMin; + default: + return RecurrenceDescriptor::MRK_Invalid; + } +} + +bool expandReductions(Function &F, const TargetTransformInfo *TTI) { + bool Changed = false; + SmallVector<IntrinsicInst*, 4> Worklist; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (auto II = dyn_cast<IntrinsicInst>(&*I)) + Worklist.push_back(II); + + for (auto *II : Worklist) { + IRBuilder<> Builder(II); + Value *Vec = nullptr; + auto ID = II->getIntrinsicID(); + auto MRK = RecurrenceDescriptor::MRK_Invalid; + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + case 
Intrinsic::experimental_vector_reduce_fmul: + // FMFs must be attached to the call, otherwise it's an ordered reduction + // and it can't be handled by generating this shuffle sequence. + // TODO: Implement scalarization of ordered reductions here for targets + // without native support. + if (!II->getFastMathFlags().unsafeAlgebra()) + continue; + Vec = II->getArgOperand(1); + break; + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + Vec = II->getArgOperand(0); + MRK = getMRK(ID); + break; + default: + continue; + } + if (!TTI->shouldExpandReduction(II)) + continue; + auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + II->replaceAllUsesWith(Rdx); + II->eraseFromParent(); + Changed = true; + } + return Changed; +} + +class ExpandReductions : public FunctionPass { +public: + static char ID; + ExpandReductions() : FunctionPass(ID) { + initializeExpandReductionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return expandReductions(F, TTI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + } +}; +} + +char ExpandReductions::ID; +INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions", + "Expand reduction intrinsics", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(ExpandReductions, "expand-reductions", + "Expand reduction intrinsics", false, false) + +FunctionPass *llvm::createExpandReductionsPass() { + return new ExpandReductions(); +} + +PreservedAnalyses ExpandReductionsPass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &TTI = AM.getResult<TargetIRAnalysis>(F); + if (!expandReductions(F, &TTI)) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/contrib/llvm/lib/CodeGen/FEntryInserter.cpp b/contrib/llvm/lib/CodeGen/FEntryInserter.cpp new file mode 100644 index 0000000..0759bf6 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/FEntryInserter.cpp @@ -0,0 +1,55 @@ +//===-- FEntryInsertion.cpp - Patchable prologues for LLVM -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file edits function bodies to insert fentry calls. 
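Returning to the ExpandReductions pass above: getShuffleReduction expands each reduction intrinsic into a short sequence of shufflevector-plus-binary-op rounds that repeatedly fold the upper half of the vector onto the lower half. The snippet below is only a scalar model, in plain C++ with made-up names, of the value that sequence computes for an add reduction; it is not the code the pass emits.

#include <cassert>
#include <cstddef>
#include <vector>

// Each round folds the upper half of a power-of-two-wide vector onto the
// lower half, so a width-N reduction takes log2(N) rounds and element 0
// ends up holding the result.
static int reduceAddByHalving(std::vector<int> V) {
  assert(!V.empty() && (V.size() & (V.size() - 1)) == 0 && "power-of-two width");
  for (std::size_t Half = V.size() / 2; Half >= 1; Half /= 2)
    for (std::size_t I = 0; I != Half; ++I)
      V[I] += V[I + Half]; // models one shufflevector + add round
  return V[0];
}

int main() {
  assert(reduceAddByHalving({1, 2, 3, 4}) == 10);
  return 0;
}

Roughly speaking, for the min/max variants the per-round add is replaced by the compare-and-select implied by the recurrence kind that getMRK selects.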
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +namespace { +struct FEntryInserter : public MachineFunctionPass { + static char ID; // Pass identification, replacement for typeid + FEntryInserter() : MachineFunctionPass(ID) { + initializeFEntryInserterPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; +} + +bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) { + const std::string FEntryName = + MF.getFunction()->getFnAttribute("fentry-call").getValueAsString(); + if (FEntryName != "true") + return false; + + auto &FirstMBB = *MF.begin(); + auto &FirstMI = *FirstMBB.begin(); + + auto *TII = MF.getSubtarget().getInstrInfo(); + BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(), + TII->get(TargetOpcode::FENTRY_CALL)); + return true; +} + +char FEntryInserter::ID = 0; +char &llvm::FEntryInserterID = FEntryInserter::ID; +INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false, + false) diff --git a/contrib/llvm/lib/CodeGen/FaultMaps.cpp b/contrib/llvm/lib/CodeGen/FaultMaps.cpp index 2acafaf..2924b01 100644 --- a/contrib/llvm/lib/CodeGen/FaultMaps.cpp +++ b/contrib/llvm/lib/CodeGen/FaultMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- FaultMaps.cpp ---------------------------===// +//===- FaultMaps.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,13 +8,16 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/FaultMaps.h" - +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel, } } - const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) { switch (FT) { default: llvm_unreachable("unhandled fault type!"); - case FaultMaps::FaultingLoad: return "FaultingLoad"; + case FaultMaps::FaultingLoadStore: + return "FaultingLoadStore"; + case FaultMaps::FaultingStore: + return "FaultingStore"; } } diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp index d61afad..9c71b18 100644 --- a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp +++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp @@ -11,10 +11,10 @@ // funclets being contiguous. 
// //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" using namespace llvm; #define DEBUG_TYPE "funclet-layout" @@ -37,7 +37,7 @@ public: char FuncletLayout::ID = 0; char &llvm::FuncletLayoutID = FuncletLayout::ID; -INITIALIZE_PASS(FuncletLayout, "funclet-layout", +INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE, "Contiguously Lay Out Funclets", false, false) bool FuncletLayout::runOnMachineFunction(MachineFunction &F) { diff --git a/contrib/llvm/lib/CodeGen/GCMetadata.cpp b/contrib/llvm/lib/CodeGen/GCMetadata.cpp index be21c73..456fa79 100644 --- a/contrib/llvm/lib/CodeGen/GCMetadata.cpp +++ b/contrib/llvm/lib/CodeGen/GCMetadata.cpp @@ -11,22 +11,27 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCStrategy.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <memory> +#include <string> + using namespace llvm; namespace { class Printer : public FunctionPass { static char ID; + raw_ostream &OS; public: @@ -38,7 +43,8 @@ public: bool runOnFunction(Function &F) override; bool doFinalization(Module &M) override; }; -} + +} // end anonymous namespace INITIALIZE_PASS(GCModuleInfo, "collector-metadata", "Create Garbage Collector Module Metadata", false, false) @@ -48,7 +54,7 @@ INITIALIZE_PASS(GCModuleInfo, "collector-metadata", GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S) : F(F), S(S), FrameSize(~0LL) {} -GCFunctionInfo::~GCFunctionInfo() {} +GCFunctionInfo::~GCFunctionInfo() = default; // ----------------------------------------------------------------------------- @@ -67,7 +73,7 @@ GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) { return *I->second; GCStrategy *S = getGCStrategy(F.getGC()); - Functions.push_back(make_unique<GCFunctionInfo>(F, *S)); + Functions.push_back(llvm::make_unique<GCFunctionInfo>(F, *S)); GCFunctionInfo *GFI = Functions.back().get(); FInfoMap[&F] = GFI; return *GFI; diff --git a/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp index d183c7f..bc7beb6 100644 --- a/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp @@ -1,4 +1,4 @@ -//===-- GCMetadataPrinter.cpp - Garbage collection infrastructure ---------===// +//===- GCMetadataPrinter.cpp - Garbage collection infrastructure ----------===// // // The LLVM Compiler Infrastructure // @@ -12,10 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GCMetadataPrinter.h" + using namespace llvm; LLVM_INSTANTIATE_REGISTRY(GCMetadataPrinterRegistry) -GCMetadataPrinter::GCMetadataPrinter() {} +GCMetadataPrinter::GCMetadataPrinter() = default; -GCMetadataPrinter::~GCMetadataPrinter() {} +GCMetadataPrinter::~GCMetadataPrinter() = default; diff --git a/contrib/llvm/lib/CodeGen/GCStrategy.cpp b/contrib/llvm/lib/CodeGen/GCStrategy.cpp index 31ab86f..6be4c16 100644 --- a/contrib/llvm/lib/CodeGen/GCStrategy.cpp +++ 
b/contrib/llvm/lib/CodeGen/GCStrategy.cpp @@ -1,4 +1,4 @@ -//===-- GCStrategy.cpp - Garbage Collector Description --------------------===// +//===- GCStrategy.cpp - Garbage Collector Description ---------------------===// // // The LLVM Compiler Infrastructure // @@ -18,7 +18,4 @@ using namespace llvm; LLVM_INSTANTIATE_REGISTRY(GCRegistry) -GCStrategy::GCStrategy() - : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false), - CustomWriteBarriers(false), CustomRoots(false), InitRoots(true), - UsesMetadata(false) {} +GCStrategy::GCStrategy() = default; diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 1321221..be0c5c2 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -24,40 +24,42 @@ using namespace llvm; bool CallLowering::lowerCall( - MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg, + MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg, ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const { - auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout(); + auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout(); // First step is to marshall all the function's parameters into the correct // physregs and memory locations. Gather the sequence of argument types that // we'll pass to the assigner function. SmallVector<ArgInfo, 8> OrigArgs; unsigned i = 0; - for (auto &Arg : CI.arg_operands()) { - ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}}; - setArgFlags(OrigArg, i + 1, DL, CI); + unsigned NumFixedArgs = CS.getFunctionType()->getNumParams(); + for (auto &Arg : CS.args()) { + ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}, + i < NumFixedArgs}; + setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS); OrigArgs.push_back(OrigArg); ++i; } MachineOperand Callee = MachineOperand::CreateImm(0); - if (Function *F = CI.getCalledFunction()) + if (const Function *F = CS.getCalledFunction()) Callee = MachineOperand::CreateGA(F, 0); else Callee = MachineOperand::CreateReg(GetCalleeReg(), false); - ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}}; + ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}}; if (!OrigRet.Ty->isVoidTy()) - setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI); + setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS); - return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs); + return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs); } template <typename FuncInfoTy> void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const { - const AttributeSet &Attrs = FuncInfo.getAttributes(); + const AttributeList &Attrs = FuncInfo.getAttributes(); if (Attrs.hasAttribute(OpIdx, Attribute::ZExt)) Arg.Flags.setZExt(); if (Attrs.hasAttribute(OpIdx, Attribute::SExt)) @@ -81,8 +83,8 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. 
unsigned FrameAlign; - if (FuncInfo.getParamAlignment(OpIdx)) - FrameAlign = FuncInfo.getParamAlignment(OpIdx); + if (FuncInfo.getParamAlignment(OpIdx - 2)) + FrameAlign = FuncInfo.getParamAlignment(OpIdx - 2); else FrameAlign = getTLI()->getByValTypeAlignment(ElementTy, DL); Arg.Flags.setByValAlign(FrameAlign); @@ -103,7 +105,6 @@ CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx, const CallInst &FuncInfo) const; bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, - CCAssignFn *AssignFn, ArrayRef<ArgInfo> Args, ValueHandler &Handler) const { MachineFunction &MF = MIRBuilder.getMF(); @@ -116,12 +117,20 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, unsigned NumArgs = Args.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT CurVT = MVT::getVT(Args[i].Ty); - if (AssignFn(i, CurVT, CurVT, CCValAssign::Full, Args[i].Flags, CCInfo)) + if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo)) return false; } - for (unsigned i = 0, e = Args.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (unsigned i = 0, e = Args.size(), j = 0; i != e; ++i, ++j) { + assert(j < ArgLocs.size() && "Skipped too many arg locs"); + + CCValAssign &VA = ArgLocs[j]; + assert(VA.getValNo() == i && "Location doesn't correspond to current arg"); + + if (VA.needsCustom()) { + j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j)); + continue; + } if (VA.isRegLoc()) Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA); diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp index fcd2722..29d1209 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp @@ -26,6 +26,7 @@ void llvm::initializeGlobalISel(PassRegistry &Registry) { void llvm::initializeGlobalISel(PassRegistry &Registry) { initializeIRTranslatorPass(Registry); initializeLegalizerPass(Registry); + initializeLocalizerPass(Registry); initializeRegBankSelectPass(Registry); initializeInstructionSelectPass(Registry); } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 89a042f..ed1bd99 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator --*- C++ -*-==// +//===- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator ---*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -11,43 +11,93 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/IRTranslator.h" - +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include 
"llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <string> +#include <utility> +#include <vector> #define DEBUG_TYPE "irtranslator" using namespace llvm; char IRTranslator::ID = 0; + INITIALIZE_PASS_BEGIN(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI", false, false) -static void reportTranslationError(const Value &V, const Twine &Message) { - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << Message << ": " << V << '\n'; - report_fatal_error(Err.str()); +static void reportTranslationError(MachineFunction &MF, + const TargetPassConfig &TPC, + OptimizationRemarkEmitter &ORE, + OptimizationRemarkMissed &R) { + MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. + if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) + R << (" (in function: " + MF.getName() + ")").str(); + + if (TPC.isGlobalISelAbortEnabled()) + report_fatal_error(R.getMsg()); + else + ORE.emit(R); } -IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) { +IRTranslator::IRTranslator() : MachineFunctionPass(ID) { initializeIRTranslatorPass(*PassRegistry::getPassRegistry()); } @@ -56,31 +106,33 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } - unsigned IRTranslator::getOrCreateVReg(const Value &Val) { unsigned &ValReg = ValToVReg[&Val]; - // Check if this is the first time we see Val. - if (!ValReg) { - // Fill ValRegsSequence with the sequence of registers - // we need to concat together to produce the value. 
- assert(Val.getType()->isSized() && - "Don't know how to create an empty vreg"); - unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL}); - ValReg = VReg; - - if (auto CV = dyn_cast<Constant>(&Val)) { - bool Success = translate(*CV, VReg); - if (!Success) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return VReg; - } - reportTranslationError(Val, "unable to translate constant"); - } + + if (ValReg) + return ValReg; + + // Fill ValRegsSequence with the sequence of registers + // we need to concat together to produce the value. + assert(Val.getType()->isSized() && + "Don't know how to create an empty vreg"); + unsigned VReg = + MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL)); + ValReg = VReg; + + if (auto CV = dyn_cast<Constant>(&Val)) { + bool Success = translate(*CV, VReg); + if (!Success) { + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + MF->getFunction()->getSubprogram(), + &MF->getFunction()->getEntryBlock()); + R << "unable to translate constant: " << ore::NV("Type", Val.getType()); + reportTranslationError(*MF, *TPC, *ORE, R); + return VReg; } } - return ValReg; + + return VReg; } int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) { @@ -112,28 +164,27 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { } else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) { Alignment = LI->getAlignment(); ValTy = LI->getType(); - } else if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); + } else { + OptimizationRemarkMissed R("gisel-irtranslator", "", &I); + R << "unable to translate memop: " << ore::NV("Opcode", &I); + reportTranslationError(*MF, *TPC, *ORE, R); return 1; - } else - llvm_unreachable("unhandled memory instruction"); + } return Alignment ? Alignment : DL->getABITypeAlignment(ValTy); } -MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { +MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; - if (!MBB) { - MBB = MF->CreateMachineBasicBlock(&BB); - MF->push_back(MBB); - - if (BB.hasAddressTaken()) - MBB->setHasAddressTaken(); - } + assert(MBB && "BasicBlock was not encountered before"); return *MBB; } +void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { + assert(NewPred && "new predecessor must be a real MachineBasicBlock"); + MachinePreds[Edge].push_back(NewPred); +} + bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { // FIXME: handle signed/unsigned wrapping flags. @@ -149,6 +200,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, return true; } +bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { + // -0.0 - X --> G_FNEG + if (isa<Constant>(U.getOperand(0)) && + U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) { + MIRBuilder.buildInstr(TargetOpcode::G_FNEG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(1))); + return true; + } + return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); +} + bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { const CmpInst *CI = dyn_cast<CmpInst>(&U); @@ -158,9 +221,14 @@ bool IRTranslator::translateCompare(const User &U, CmpInst::Predicate Pred = CI ? 
CI->getPredicate() : static_cast<CmpInst::Predicate>( cast<ConstantExpr>(U).getPredicate()); - if (CmpInst::isIntPredicate(Pred)) MIRBuilder.buildICmp(Pred, Res, Op0, Op1); + else if (Pred == CmpInst::FCMP_FALSE) + MIRBuilder.buildCopy( + Res, getOrCreateVReg(*Constant::getNullValue(CI->getType()))); + else if (Pred == CmpInst::FCMP_TRUE) + MIRBuilder.buildCopy( + Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType()))); else MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); @@ -183,18 +251,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { // We want a G_BRCOND to the true BB followed by an unconditional branch. unsigned Tst = getOrCreateVReg(*BrInst.getCondition()); const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++)); - MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt); + MachineBasicBlock &TrueBB = getMBB(TrueTgt); MIRBuilder.buildBrCond(Tst, TrueBB); } const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ)); - MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt); - MIRBuilder.buildBr(TgtBB); + MachineBasicBlock &TgtBB = getMBB(BrTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the unconditional target is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); // Link successors. - MachineBasicBlock &CurBB = MIRBuilder.getMBB(); for (const BasicBlock *Succ : BrInst.successors()) - CurBB.addSuccessor(&getOrCreateBB(*Succ)); + CurBB.addSuccessor(&getMBB(*Succ)); return true; } @@ -209,30 +280,52 @@ bool IRTranslator::translateSwitch(const User &U, const SwitchInst &SwInst = cast<SwitchInst>(U); const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); + const BasicBlock *OrigBB = SwInst.getParent(); - LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); + LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL); for (auto &CaseIt : SwInst.cases()) { const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); - MachineBasicBlock &CurBB = MIRBuilder.getMBB(); - MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); + MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); + const BasicBlock *TrueBB = CaseIt.getCaseSuccessor(); + MachineBasicBlock &TrueMBB = getMBB(*TrueBB); - MIRBuilder.buildBrCond(Tst, TrueBB); - CurBB.addSuccessor(&TrueBB); + MIRBuilder.buildBrCond(Tst, TrueMBB); + CurMBB.addSuccessor(&TrueMBB); + addMachineCFGPred({OrigBB, TrueBB}, &CurMBB); - MachineBasicBlock *FalseBB = + MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(SwInst.getParent()); - MF->push_back(FalseBB); - MIRBuilder.buildBr(*FalseBB); - CurBB.addSuccessor(FalseBB); + // Insert the comparison blocks one after the other. 
+ MF->insert(std::next(CurMBB.getIterator()), FalseMBB); + MIRBuilder.buildBr(*FalseMBB); + CurMBB.addSuccessor(FalseMBB); - MIRBuilder.setMBB(*FalseBB); + MIRBuilder.setMBB(*FalseMBB); } // handle default case - MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); - MIRBuilder.buildBr(DefaultBB); - MIRBuilder.getMBB().addSuccessor(&DefaultBB); + const BasicBlock *DefaultBB = SwInst.getDefaultDest(); + MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB); + MIRBuilder.buildBr(DefaultMBB); + MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); + CurMBB.addSuccessor(&DefaultMBB); + addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB); + + return true; +} + +bool IRTranslator::translateIndirectBr(const User &U, + MachineIRBuilder &MIRBuilder) { + const IndirectBrInst &BrInst = cast<IndirectBrInst>(U); + + const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress()); + MIRBuilder.buildBrIndirect(Tgt); + + // Link successors. + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + for (const BasicBlock *Succ : BrInst.successors()) + CurBB.addSuccessor(&getMBB(*Succ)); return true; } @@ -240,47 +333,38 @@ bool IRTranslator::translateSwitch(const User &U, bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast<LoadInst>(U); - if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic()) - return false; - - assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment"); auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOLoad; unsigned Res = getOrCreateVReg(LI); unsigned Addr = getOrCreateVReg(*LI.getPointerOperand()); - LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL}; + MIRBuilder.buildLoad( Res, Addr, *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()), Flags, DL->getTypeStoreSize(LI.getType()), - getMemOpAlignment(LI))); + getMemOpAlignment(LI), AAMDNodes(), nullptr, + LI.getSyncScopeID(), LI.getOrdering())); return true; } bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { const StoreInst &SI = cast<StoreInst>(U); - - if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic()) - return false; - - assert(!SI.isAtomic() && "only non-atomic stores supported at the moment"); auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOStore; unsigned Val = getOrCreateVReg(*SI.getValueOperand()); unsigned Addr = getOrCreateVReg(*SI.getPointerOperand()); - LLT VTy{*SI.getValueOperand()->getType(), *DL}, - PTy{*SI.getPointerOperand()->getType(), *DL}; MIRBuilder.buildStore( Val, Addr, *MF->getMachineMemOperand( MachinePointerInfo(SI.getPointerOperand()), Flags, DL->getTypeStoreSize(SI.getValueOperand()->getType()), - getMemOpAlignment(SI))); + getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSyncScopeID(), + SI.getOrdering())); return true; } @@ -290,6 +374,15 @@ bool IRTranslator::translateExtractValue(const User &U, Type *Int32Ty = Type::getInt32Ty(U.getContext()); SmallVector<Value *, 1> Indices; + // If Src is a single element ConstantStruct, translate extractvalue + // to that element to avoid inserting a cast instruction. + if (auto CS = dyn_cast<ConstantStruct>(Src)) + if (CS->getNumOperands() == 1) { + unsigned Res = getOrCreateVReg(*CS->getOperand(0)); + ValToVReg[&U] = Res; + return true; + } + // getIndexedOffsetInType is designed for GEPs, so the first index is the // usual array element rather than looking into the actual aggregate. 
Indices.push_back(ConstantInt::get(Int32Ty, 0)); @@ -305,7 +398,7 @@ bool IRTranslator::translateExtractValue(const User &U, uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); - MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src)); + MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset); return true; } @@ -331,29 +424,36 @@ bool IRTranslator::translateInsertValue(const User &U, uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); - const Value &Inserted = *U.getOperand(1); - MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), getOrCreateVReg(Inserted), - Offset); + unsigned Inserted = getOrCreateVReg(*U.getOperand(1)); + MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), Inserted, Offset); return true; } bool IRTranslator::translateSelect(const User &U, MachineIRBuilder &MIRBuilder) { - MIRBuilder.buildSelect(getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)), - getOrCreateVReg(*U.getOperand(1)), - getOrCreateVReg(*U.getOperand(2))); + unsigned Res = getOrCreateVReg(U); + unsigned Tst = getOrCreateVReg(*U.getOperand(0)); + unsigned Op0 = getOrCreateVReg(*U.getOperand(1)); + unsigned Op1 = getOrCreateVReg(*U.getOperand(2)); + MIRBuilder.buildSelect(Res, Tst, Op0, Op1); return true; } bool IRTranslator::translateBitCast(const User &U, MachineIRBuilder &MIRBuilder) { - if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) { + // If we're bitcasting to the source type, we can reuse the source vreg. + if (getLLTForType(*U.getOperand(0)->getType(), *DL) == + getLLTForType(*U.getType(), *DL)) { + // Get the source vreg now, to avoid invalidating ValToVReg. + unsigned SrcReg = getOrCreateVReg(*U.getOperand(0)); unsigned &Reg = ValToVReg[&U]; + // If we already assigned a vreg for this bitcast, we can't change that. + // Emit a copy to satisfy the users we already emitted. 
if (Reg) - MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0))); + MIRBuilder.buildCopy(Reg, SrcReg); else - Reg = getOrCreateVReg(*U.getOperand(0)); + Reg = SrcReg; return true; } return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder); @@ -375,9 +475,10 @@ bool IRTranslator::translateGetElementPtr(const User &U, Value &Op0 = *U.getOperand(0); unsigned BaseReg = getOrCreateVReg(Op0); - LLT PtrTy{*Op0.getType(), *DL}; - unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace()); - LLT OffsetTy = LLT::scalar(PtrSize); + Type *PtrIRTy = Op0.getType(); + LLT PtrTy = getLLTForType(*PtrIRTy, *DL); + Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy); + LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); int64_t Offset = 0; for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U); @@ -399,8 +500,8 @@ bool IRTranslator::translateGetElementPtr(const User &U, if (Offset != 0) { unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); - unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(OffsetReg, Offset); + unsigned OffsetReg = + getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg); BaseReg = NewBaseReg; @@ -408,8 +509,8 @@ bool IRTranslator::translateGetElementPtr(const User &U, } // N = N + Idx * ElementSize; - unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(ElementSizeReg, ElementSize); + unsigned ElementSizeReg = + getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize)); unsigned IdxReg = getOrCreateVReg(*Idx); if (MRI->getType(IdxReg) != OffsetTy) { @@ -428,8 +529,7 @@ bool IRTranslator::translateGetElementPtr(const User &U, } if (Offset != 0) { - unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(OffsetReg, Offset); + unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg); return true; } @@ -438,13 +538,12 @@ bool IRTranslator::translateGetElementPtr(const User &U, return true; } -bool IRTranslator::translateMemcpy(const CallInst &CI, - MachineIRBuilder &MIRBuilder) { - LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL}; - if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() != - 0 || - cast<PointerType>(CI.getArgOperand(1)->getType())->getAddressSpace() != - 0 || +bool IRTranslator::translateMemfunc(const CallInst &CI, + MachineIRBuilder &MIRBuilder, + unsigned ID) { + LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL); + Type *DstTy = CI.getArgOperand(0)->getType(); + if (cast<PointerType>(DstTy)->getAddressSpace() != 0 || SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0)) return false; @@ -454,14 +553,32 @@ bool IRTranslator::translateMemcpy(const CallInst &CI, Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); } - MachineOperand Callee = MachineOperand::CreateES("memcpy"); + const char *Callee; + switch (ID) { + case Intrinsic::memmove: + case Intrinsic::memcpy: { + Type *SrcTy = CI.getArgOperand(1)->getType(); + if(cast<PointerType>(SrcTy)->getAddressSpace() != 0) + return false; + Callee = ID == Intrinsic::memcpy ? 
"memcpy" : "memmove"; + break; + } + case Intrinsic::memset: + Callee = "memset"; + break; + default: + return false; + } - return CLI->lowerCall(MIRBuilder, Callee, + return CLI->lowerCall(MIRBuilder, CI.getCallingConv(), + MachineOperand::CreateES(Callee), CallLowering::ArgInfo(0, CI.getType()), Args); } void IRTranslator::getStackGuard(unsigned DstReg, MachineIRBuilder &MIRBuilder) { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF)); auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD); MIB.addDef(DstReg); @@ -482,7 +599,7 @@ void IRTranslator::getStackGuard(unsigned DstReg, bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder) { - LLT Ty{*CI.getOperand(0)->getType(), *DL}; + LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL); LLT s1 = LLT::scalar(1); unsigned Width = Ty.getSizeInBits(); unsigned Res = MRI->createGenericVirtualRegister(Ty); @@ -494,12 +611,12 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, .addUse(getOrCreateVReg(*CI.getOperand(1))); if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) { - unsigned Zero = MRI->createGenericVirtualRegister(s1); - EntryBuilder.buildConstant(Zero, 0); + unsigned Zero = getOrCreateVReg( + *Constant::getNullValue(Type::getInt1Ty(CI.getContext()))); MIB.addUse(Zero); } - MIRBuilder.buildSequence(getOrCreateVReg(CI), Res, 0, Overflow, Width); + MIRBuilder.buildSequence(getOrCreateVReg(CI), {Res, Overflow}, {0, Width}); return true; } @@ -508,12 +625,83 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, switch (ID) { default: break; - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - // FIXME: these obviously need to be supported properly. - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // Stack coloring is not enabled in O0 (which we care about now) so we can + // drop these. Make sure someone notices when we start compiling at higher + // opts though. + if (MF->getTarget().getOptLevel() != CodeGenOpt::None) + return false; + return true; + case Intrinsic::dbg_declare: { + const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI); + assert(DI.getVariable() && "Missing variable"); + + const Value *Address = DI.getAddress(); + if (!Address || isa<UndefValue>(Address)) { + DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); + return true; + } + + assert(DI.getVariable()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + auto AI = dyn_cast<AllocaInst>(Address); + if (AI && AI->isStaticAlloca()) { + // Static allocas are tracked at the MF level, no need for DBG_VALUE + // instructions (in fact, they get ignored if they *do* exist). + MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), + getOrCreateFrameIndex(*AI), DI.getDebugLoc()); + } else + MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address), + DI.getVariable(), DI.getExpression()); + return true; + } + case Intrinsic::vaend: + // No target I know of cares about va_end. Certainly no in-tree target + // does. Simplest intrinsic ever! 
+ return true; + case Intrinsic::vastart: { + auto &TLI = *MF->getSubtarget().getTargetLowering(); + Value *Ptr = CI.getArgOperand(0); + unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8; + + MIRBuilder.buildInstr(TargetOpcode::G_VASTART) + .addUse(getOrCreateVReg(*Ptr)) + .addMemOperand(MF->getMachineMemOperand( + MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0)); + return true; + } + case Intrinsic::dbg_value: { + // This form of DBG_VALUE is target-independent. + const DbgValueInst &DI = cast<DbgValueInst>(CI); + const Value *V = DI.getValue(); + assert(DI.getVariable()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + if (!V) { + // Currently the optimizer can produce this; insert an undef to + // help debugging. Probably the optimizer should not do this. + MIRBuilder.buildIndirectDbgValue(0, DI.getOffset(), DI.getVariable(), + DI.getExpression()); + } else if (const auto *CI = dyn_cast<Constant>(V)) { + MIRBuilder.buildConstDbgValue(*CI, DI.getOffset(), DI.getVariable(), + DI.getExpression()); + } else { + unsigned Reg = getOrCreateVReg(*V); + // FIXME: This does not handle register-indirect values at offset 0. The + // direct/indirect thing shouldn't really be handled by something as + // implicit as reg+noreg vs reg+imm in the first palce, but it seems + // pretty baked in right now. + if (DI.getOffset() != 0) + MIRBuilder.buildIndirectDbgValue(Reg, DI.getOffset(), DI.getVariable(), + DI.getExpression()); + else + MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), + DI.getExpression()); + } return true; + } case Intrinsic::uadd_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder); case Intrinsic::sadd_with_overflow: @@ -526,8 +714,43 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder); case Intrinsic::smul_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder); + case Intrinsic::pow: + MIRBuilder.buildInstr(TargetOpcode::G_FPOW) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))) + .addUse(getOrCreateVReg(*CI.getArgOperand(1))); + return true; + case Intrinsic::exp: + MIRBuilder.buildInstr(TargetOpcode::G_FEXP) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::exp2: + MIRBuilder.buildInstr(TargetOpcode::G_FEXP2) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::log: + MIRBuilder.buildInstr(TargetOpcode::G_FLOG) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::log2: + MIRBuilder.buildInstr(TargetOpcode::G_FLOG2) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::fma: + MIRBuilder.buildInstr(TargetOpcode::G_FMA) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))) + .addUse(getOrCreateVReg(*CI.getArgOperand(1))) + .addUse(getOrCreateVReg(*CI.getArgOperand(2))); + return true; case Intrinsic::memcpy: - return translateMemcpy(CI, MIRBuilder); + case Intrinsic::memmove: + case Intrinsic::memset: + return translateMemfunc(CI, MIRBuilder, ID); case Intrinsic::eh_typeid_for: { GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0)); unsigned Reg = getOrCreateVReg(CI); @@ -546,7 +769,7 @@ bool IRTranslator::translateKnownIntrinsic(const 
CallInst &CI, Intrinsic::ID ID, getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; case Intrinsic::stackprotector: { - LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL}; + LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); @@ -564,18 +787,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return false; } +bool IRTranslator::translateInlineAsm(const CallInst &CI, + MachineIRBuilder &MIRBuilder) { + const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue()); + if (!IA.getConstraintString().empty()) + return false; + + unsigned ExtraInfo = 0; + if (IA.hasSideEffects()) + ExtraInfo |= InlineAsm::Extra_HasSideEffects; + if (IA.getDialect() == InlineAsm::AD_Intel) + ExtraInfo |= InlineAsm::Extra_AsmDialect; + + MIRBuilder.buildInstr(TargetOpcode::INLINEASM) + .addExternalSymbol(IA.getAsmString().c_str()) + .addImm(ExtraInfo); + + return true; +} + bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { const CallInst &CI = cast<CallInst>(U); auto TII = MF->getTarget().getIntrinsicInfo(); const Function *F = CI.getCalledFunction(); + if (CI.isInlineAsm()) + return translateInlineAsm(CI, MIRBuilder); + if (!F || !F->isIntrinsic()) { unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI); SmallVector<unsigned, 8> Args; for (auto &Arg: CI.arg_operands()) Args.push_back(getOrCreateVReg(*Arg)); - return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() { + MF->getFrameInfo().setHasCalls(true); + return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() { return getOrCreateVReg(*CI.getCalledValue()); }); } @@ -594,11 +840,26 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory()); for (auto &Arg : CI.arg_operands()) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) - MIB.addImm(CI->getSExtValue()); - else - MIB.addUse(getOrCreateVReg(*Arg)); + // Some intrinsics take metadata parameters. Reject them. + if (isa<MetadataAsValue>(Arg)) + return false; + MIB.addUse(getOrCreateVReg(*Arg)); + } + + // Add a MachineMemOperand if it is a target mem intrinsic. + const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); + TargetLowering::IntrinsicInfo Info; + // TODO: Add a GlobalISel version of getTgtMemIntrinsic. + if (TLI.getTgtMemIntrinsic(Info, CI, ID)) { + MachineMemOperand::Flags Flags = + Info.vol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; + Flags |= + Info.readMem ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore; + uint64_t Size = Info.memVT.getSizeInBits() >> 3; + MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal), + Flags, Size, Info.align)); } + return true; } @@ -610,7 +871,7 @@ bool IRTranslator::translateInvoke(const User &U, const BasicBlock *ReturnBB = I.getSuccessor(0); const BasicBlock *EHPadBB = I.getSuccessor(1); - const Value *Callee(I.getCalledValue()); + const Value *Callee = I.getCalledValue(); const Function *Fn = dyn_cast<Function>(Callee); if (isa<InlineAsm>(Callee)) return false; @@ -627,30 +888,30 @@ bool IRTranslator::translateInvoke(const User &U, if (!isa<LandingPadInst>(EHPadBB->front())) return false; - // Emit the actual call, bracketed by EH_LABELs so that the MF knows about // the region covered by the try. 
MCSymbol *BeginSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol); unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I); - SmallVector<CallLowering::ArgInfo, 8> Args; + SmallVector<unsigned, 8> Args; for (auto &Arg: I.arg_operands()) - Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); + Args.push_back(getOrCreateVReg(*Arg)); - if (!CLI->lowerCall(MIRBuilder, MachineOperand::CreateGA(Fn, 0), - CallLowering::ArgInfo(Res, I.getType()), Args)) + if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, + [&]() { return getOrCreateVReg(*I.getCalledValue()); })) return false; MCSymbol *EndSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol); // FIXME: track probabilities. - MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB), - &ReturnMBB = getOrCreateBB(*ReturnBB); + MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB), + &ReturnMBB = getMBB(*ReturnBB); MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol); MIRBuilder.getMBB().addSuccessor(&ReturnMBB); MIRBuilder.getMBB().addSuccessor(&EHPadMBB); + MIRBuilder.buildBr(ReturnMBB); return true; } @@ -684,37 +945,161 @@ bool IRTranslator::translateLandingPad(const User &U, MIRBuilder.buildInstr(TargetOpcode::EH_LABEL) .addSym(MF->addLandingPad(&MBB)); + LLT Ty = getLLTForType(*LP.getType(), *DL); + unsigned Undef = MRI->createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Undef); + + SmallVector<LLT, 2> Tys; + for (Type *Ty : cast<StructType>(LP.getType())->elements()) + Tys.push_back(getLLTForType(*Ty, *DL)); + assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); + // Mark exception register as live in. - SmallVector<unsigned, 2> Regs; - SmallVector<uint64_t, 2> Offsets; - LLT p0 = LLT::pointer(0, DL->getPointerSizeInBits()); - if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) { - unsigned VReg = MRI->createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(VReg, Reg); - Regs.push_back(VReg); - Offsets.push_back(0); + unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn); + if (!ExceptionReg) + return false; + + MBB.addLiveIn(ExceptionReg); + unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]), + Tmp = MRI->createGenericVirtualRegister(Ty); + MIRBuilder.buildCopy(VReg, ExceptionReg); + MIRBuilder.buildInsert(Tmp, Undef, VReg, 0); + + unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn); + if (!SelectorReg) + return false; + + MBB.addLiveIn(SelectorReg); + + // N.b. the exception selector register always has pointer type and may not + // match the actual IR-level type in the landingpad so an extra cast is + // needed. 
+ unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]); + MIRBuilder.buildCopy(PtrVReg, SelectorReg); + + VReg = MRI->createGenericVirtualRegister(Tys[1]); + MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg); + MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg, + Tys[0].getSizeInBits()); + return true; +} + +bool IRTranslator::translateAlloca(const User &U, + MachineIRBuilder &MIRBuilder) { + auto &AI = cast<AllocaInst>(U); + + if (AI.isStaticAlloca()) { + unsigned Res = getOrCreateVReg(AI); + int FI = getOrCreateFrameIndex(AI); + MIRBuilder.buildFrameIndex(Res, FI); + return true; } - if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) { - unsigned VReg = MRI->createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(VReg, Reg); - Regs.push_back(VReg); - Offsets.push_back(p0.getSizeInBits()); + // Now we're in the harder dynamic case. + Type *Ty = AI.getAllocatedType(); + unsigned Align = + std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment()); + + unsigned NumElts = getOrCreateVReg(*AI.getArraySize()); + + Type *IntPtrIRTy = DL->getIntPtrType(AI.getType()); + LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL); + if (MRI->getType(NumElts) != IntPtrTy) { + unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy); + MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts); + NumElts = ExtElts; + } + + unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy); + unsigned TySize = + getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty))); + MIRBuilder.buildMul(AllocSize, NumElts, TySize); + + LLT PtrTy = getLLTForType(*AI.getType(), *DL); + auto &TLI = *MF->getSubtarget().getTargetLowering(); + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + + unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildCopy(SPTmp, SPReg); + + unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize); + + // Handle alignment. We have to realign if the allocation granule was smaller + // than stack alignment, or the specific alloca requires more than stack + // alignment. + unsigned StackAlign = + MF->getSubtarget().getFrameLowering()->getStackAlignment(); + Align = std::max(Align, StackAlign); + if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) { + // Round the size of the allocation up to the stack alignment size + // by add SA-1 to the size. This doesn't overflow because we're computing + // an address inside an alloca. + unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align)); + AllocTmp = AlignedAlloc; } - MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets); + MIRBuilder.buildCopy(SPReg, AllocTmp); + MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); + + MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI); + assert(MF->getFrameInfo().hasVarSizedObjects()); return true; } -bool IRTranslator::translateStaticAlloca(const AllocaInst &AI, - MachineIRBuilder &MIRBuilder) { - if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca()) - return false; +bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) { + // FIXME: We may need more info about the type. Because of how LLT works, + // we're completely discarding the i64/double distinction here (amongst + // others). Fortunately the ABIs I know of where that matters don't use va_arg + // anyway but that's not guaranteed. 
+ MIRBuilder.buildInstr(TargetOpcode::G_VAARG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(0))) + .addImm(DL->getABITypeAlignment(U.getType())); + return true; +} - assert(AI.isStaticAlloca() && "only handle static allocas now"); - unsigned Res = getOrCreateVReg(AI); - int FI = getOrCreateFrameIndex(AI); - MIRBuilder.buildFrameIndex(Res, FI); +bool IRTranslator::translateInsertElement(const User &U, + MachineIRBuilder &MIRBuilder) { + // If it is a <1 x Ty> vector, use the scalar as it is + // not a legal vector type in LLT. + if (U.getType()->getVectorNumElements() == 1) { + unsigned Elt = getOrCreateVReg(*U.getOperand(1)); + ValToVReg[&U] = Elt; + return true; + } + unsigned Res = getOrCreateVReg(U); + unsigned Val = getOrCreateVReg(*U.getOperand(0)); + unsigned Elt = getOrCreateVReg(*U.getOperand(1)); + unsigned Idx = getOrCreateVReg(*U.getOperand(2)); + MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx); + return true; +} + +bool IRTranslator::translateExtractElement(const User &U, + MachineIRBuilder &MIRBuilder) { + // If it is a <1 x Ty> vector, use the scalar as it is + // not a legal vector type in LLT. + if (U.getOperand(0)->getType()->getVectorNumElements() == 1) { + unsigned Elt = getOrCreateVReg(*U.getOperand(0)); + ValToVReg[&U] = Elt; + return true; + } + unsigned Res = getOrCreateVReg(U); + unsigned Val = getOrCreateVReg(*U.getOperand(0)); + unsigned Idx = getOrCreateVReg(*U.getOperand(1)); + MIRBuilder.buildExtractVectorElement(Res, Val, Idx); + return true; +} + +bool IRTranslator::translateShuffleVector(const User &U, + MachineIRBuilder &MIRBuilder) { + MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(0))) + .addUse(getOrCreateVReg(*U.getOperand(1))) + .addUse(getOrCreateVReg(*U.getOperand(2))); return true; } @@ -736,11 +1121,21 @@ void IRTranslator::finishPendingPhis() { // won't create extra control flow here, otherwise we need to find the // dominating predecessor here (or perhaps force the weirder IRTranslators // to provide a simple boundary). 
+ SmallSet<const BasicBlock *, 4> HandledPreds; + for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { - assert(BBToMBB[PI->getIncomingBlock(i)]->isSuccessor(MIB->getParent()) && - "I appear to have misunderstood Machine PHIs"); - MIB.addUse(getOrCreateVReg(*PI->getIncomingValue(i))); - MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]); + auto IRPred = PI->getIncomingBlock(i); + if (HandledPreds.count(IRPred)) + continue; + + HandledPreds.insert(IRPred); + unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i)); + for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) { + assert(Pred->isSuccessor(MIB->getParent()) && + "incorrect CFG at MachineBasicBlock level"); + MIB.addUse(ValReg); + MIB.addMBB(Pred); + } } } } @@ -752,9 +1147,7 @@ bool IRTranslator::translate(const Instruction &Inst) { case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder); #include "llvm/IR/Instruction.def" default: - if (!TPC->isGlobalISelAbortEnabled()) - return false; - llvm_unreachable("unknown opcode"); + return false; } } @@ -764,25 +1157,68 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) { else if (auto CF = dyn_cast<ConstantFP>(&C)) EntryBuilder.buildFConstant(Reg, *CF); else if (isa<UndefValue>(C)) - EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg); + EntryBuilder.buildUndef(Reg); else if (isa<ConstantPointerNull>(C)) EntryBuilder.buildConstant(Reg, 0); else if (auto GV = dyn_cast<GlobalValue>(&C)) EntryBuilder.buildGlobalValue(Reg, GV); - else if (auto CE = dyn_cast<ConstantExpr>(&C)) { + else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) { + if (!CAZ->getType()->isVectorTy()) + return false; + // Return the scalar if it is a <1 x Ty> vector. + if (CAZ->getNumElements() == 1) + return translate(*CAZ->getElementValue(0u), Reg); + std::vector<unsigned> Ops; + for (unsigned i = 0; i < CAZ->getNumElements(); ++i) { + Constant &Elt = *CAZ->getElementValue(i); + Ops.push_back(getOrCreateVReg(Elt)); + } + EntryBuilder.buildMerge(Reg, Ops); + } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) { + // Return the scalar if it is a <1 x Ty> vector. + if (CV->getNumElements() == 1) + return translate(*CV->getElementAsConstant(0), Reg); + std::vector<unsigned> Ops; + for (unsigned i = 0; i < CV->getNumElements(); ++i) { + Constant &Elt = *CV->getElementAsConstant(i); + Ops.push_back(getOrCreateVReg(Elt)); + } + EntryBuilder.buildMerge(Reg, Ops); + } else if (auto CE = dyn_cast<ConstantExpr>(&C)) { switch(CE->getOpcode()) { #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder); #include "llvm/IR/Instruction.def" default: - if (!TPC->isGlobalISelAbortEnabled()) - return false; - llvm_unreachable("unknown opcode"); + return false; + } + } else if (auto CS = dyn_cast<ConstantStruct>(&C)) { + // Return the element if it is a single element ConstantStruct. 
+ if (CS->getNumOperands() == 1) { + unsigned EltReg = getOrCreateVReg(*CS->getOperand(0)); + EntryBuilder.buildCast(Reg, EltReg); + return true; + } + SmallVector<unsigned, 4> Ops; + SmallVector<uint64_t, 4> Indices; + uint64_t Offset = 0; + for (unsigned i = 0; i < CS->getNumOperands(); ++i) { + unsigned OpReg = getOrCreateVReg(*CS->getOperand(i)); + Ops.push_back(OpReg); + Indices.push_back(Offset); + Offset += MRI->getType(OpReg).getSizeInBits(); + } + EntryBuilder.buildSequence(Reg, Ops, Indices); + } else if (auto CV = dyn_cast<ConstantVector>(&C)) { + if (CV->getNumOperands() == 1) + return translate(*CV->getOperand(0), Reg); + SmallVector<unsigned, 4> Ops; + for (unsigned i = 0; i < CV->getNumOperands(); ++i) { + Ops.push_back(getOrCreateVReg(*CV->getOperand(i))); } - } else if (!TPC->isGlobalISelAbortEnabled()) + EntryBuilder.buildMerge(Reg, Ops); + } else return false; - else - llvm_unreachable("unhandled constant kind"); return true; } @@ -793,7 +1229,12 @@ void IRTranslator::finalizeFunction() { PendingPHIs.clear(); ValToVReg.clear(); FrameIndices.clear(); - Constants.clear(); + MachinePreds.clear(); + // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it + // to avoid accessing free’d memory (in runOnMachineFunction) and to avoid + // destroying it twice (in ~IRTranslator() and ~LLVMContext()) + EntryBuilder = MachineIRBuilder(); + CurBuilder = MachineIRBuilder(); } bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { @@ -807,85 +1248,97 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); TPC = &getAnalysis<TargetPassConfig>(); + ORE = llvm::make_unique<OptimizationRemarkEmitter>(&F); assert(PendingPHIs.empty() && "stale PHIs"); - // Setup a separate basic-block for the arguments and constants, falling - // through to the IR-level Function's entry block. + // Release the per-function state when we return, whether we succeeded or not. + auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); }); + + // Setup a separate basic-block for the arguments and constants MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock(); MF->push_back(EntryBB); - EntryBB->addSuccessor(&getOrCreateBB(F.front())); EntryBuilder.setMBB(*EntryBB); + // Create all blocks, in IR order, to preserve the layout. + for (const BasicBlock &BB: F) { + auto *&MBB = BBToMBB[&BB]; + + MBB = MF->CreateMachineBasicBlock(&BB); + MF->push_back(MBB); + + if (BB.hasAddressTaken()) + MBB->setHasAddressTaken(); + } + + // Make our arguments/constants entry block fallthrough to the IR entry block. + EntryBB->addSuccessor(&getMBB(F.front())); + // Lower the actual args into this basic block. SmallVector<unsigned, 8> VRegArgs; for (const Argument &Arg: F.args()) VRegArgs.push_back(getOrCreateVReg(Arg)); - bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs); - if (!Succeeded) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - finalizeFunction(); - return false; - } - report_fatal_error("Unable to lower arguments"); + if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) { + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + MF->getFunction()->getSubprogram(), + &MF->getFunction()->getEntryBlock()); + R << "unable to lower arguments: " << ore::NV("Prototype", F.getType()); + reportTranslationError(*MF, *TPC, *ORE, R); + return false; } // And translate the function! 
for (const BasicBlock &BB: F) { - MachineBasicBlock &MBB = getOrCreateBB(BB); + MachineBasicBlock &MBB = getMBB(BB); // Set the insertion point of all the following translations to // the end of this basic block. CurBuilder.setMBB(MBB); for (const Instruction &Inst: BB) { - Succeeded &= translate(Inst); - if (!Succeeded) { - if (TPC->isGlobalISelAbortEnabled()) - reportTranslationError(Inst, "unable to translate instruction"); - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - break; - } - } - } - - if (Succeeded) { - finishPendingPhis(); - - // Now that the MachineFrameInfo has been configured, no further changes to - // the reserved registers are possible. - MRI->freezeReservedRegs(*MF); - - // Merge the argument lowering and constants block with its single - // successor, the LLVM-IR entry block. We want the basic block to - // be maximal. - assert(EntryBB->succ_size() == 1 && - "Custom BB used for lowering should have only one successor"); - // Get the successor of the current entry block. - MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); - assert(NewEntryBB.pred_size() == 1 && - "LLVM-IR entry block has a predecessor!?"); - // Move all the instruction from the current entry block to the - // new entry block. - NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), - EntryBB->end()); - - // Update the live-in information for the new entry block. - for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) - NewEntryBB.addLiveIn(LiveIn); - NewEntryBB.sortUniqueLiveIns(); + if (translate(Inst)) + continue; - // Get rid of the now empty basic block. - EntryBB->removeSuccessor(&NewEntryBB); - MF->remove(EntryBB); + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << Inst; - assert(&MF->front() == &NewEntryBB && - "New entry wasn't next in the list of basic block!"); + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + Inst.getDebugLoc(), &BB); + R << "unable to translate instruction: " << ore::NV("Opcode", &Inst) + << ": '" << InstStr.str() << "'"; + reportTranslationError(*MF, *TPC, *ORE, R); + return false; + } } - finalizeFunction(); + finishPendingPhis(); + + // Merge the argument lowering and constants block with its single + // successor, the LLVM-IR entry block. We want the basic block to + // be maximal. + assert(EntryBB->succ_size() == 1 && + "Custom BB used for lowering should have only one successor"); + // Get the successor of the current entry block. + MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); + assert(NewEntryBB.pred_size() == 1 && + "LLVM-IR entry block has a predecessor!?"); + // Move all the instruction from the current entry block to the + // new entry block. + NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), + EntryBB->end()); + + // Update the live-in information for the new entry block. + for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) + NewEntryBB.addLiveIn(LiveIn); + NewEntryBB.sortUniqueLiveIns(); + + // Get rid of the now empty basic block. 
+ EntryBB->removeSuccessor(&NewEntryBB); + MF->remove(EntryBB); + MF->DeleteMachineBasicBlock(EntryBB); + + assert(&MF->front() == &NewEntryBB && + "New entry wasn't next in the list of basic block!"); return false; } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 1d205cd..a16e14f 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -12,14 +12,19 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #define DEBUG_TYPE "instruction-select" @@ -44,17 +49,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -static void reportSelectionError(const MachineInstr *MI, const Twine &Message) { - const MachineFunction &MF = *MI->getParent()->getParent(); - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << Message << ":\nIn function: " << MF.getName() << '\n'; - if (MI) - Err << *MI << '\n'; - report_fatal_error(Err.str()); -} - bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // No matter what happens, whether we successfully select the function or not, + // nothing is going to use the vreg types after us. Make sure they disappear. + auto ClearVRegTypesOnReturn = + make_scope_exit([&]() { MRI.getVRegToType().clear(); }); + // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) @@ -66,10 +68,10 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector(); assert(ISel && "Cannot work without InstructionSelector"); - // FIXME: freezeReservedRegs is now done in IRTranslator, but there are many - // other MF/MFI fields we need to initialize. + // An optimization remark emitter. Used to report failures. + MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); - const MachineRegisterInfo &MRI = MF.getRegInfo(); + // FIXME: There are many other MF/MFI fields we need to initialize. #ifndef NDEBUG // Check that our input is fully legal: we require the function to have the @@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { // that it has the same layering problem, but we only use inline methods so // end up not needing to link against the GlobalISel library. 
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) - for (const MachineBasicBlock &MBB : MF) - for (const MachineInstr &MI : MBB) - if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) - reportSelectionError(&MI, "Instruction is not legal"); + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "instruction is not legal", MI); + return false; + } #endif // FIXME: We could introduce new blocks and will need to fix the outer loop. // Until then, keep track of the number of blocks to assert that we don't. const size_t NumBlocks = MF.size(); - bool Failed = false; for (MachineBasicBlock *MBB : post_order(&MF)) { if (MBB->empty()) continue; @@ -115,14 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Selecting: \n " << MI); + // We could have folded this instruction away already, making it dead. + // If so, erase it. + if (isTriviallyDead(MI, MRI)) { + DEBUG(dbgs() << "Is dead; erasing.\n"); + MI.eraseFromParentAndMarkDBGValuesForRemoval(); + continue; + } + if (!ISel->select(MI)) { - if (TPC.isGlobalISelAbortEnabled()) - // FIXME: It would be nice to dump all inserted instructions. It's - // not - // obvious how, esp. considering select() can insert after MI. - reportSelectionError(&MI, "Cannot select"); - Failed = true; - break; + // FIXME: It would be nice to dump all inserted instructions. It's + // not obvious how, esp. considering select() can insert after MI. + reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI); + return false; } // Dump the range of instructions that MI expanded into. @@ -136,39 +145,47 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { } } + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + // Now that selection is complete, there are no more generic vregs. Verify // that the size of the now-constrained vreg is unchanged and that it has a // register class. for (auto &VRegToType : MRI.getVRegToType()) { unsigned VReg = VRegToType.first; auto *RC = MRI.getRegClassOrNull(VReg); - auto *MI = MRI.def_instr_begin(VReg) == MRI.def_instr_end() - ? 
nullptr - : &*MRI.def_instr_begin(VReg); - if (!RC) { - if (TPC.isGlobalISelAbortEnabled()) - reportSelectionError(MI, "VReg as no regclass after selection"); - Failed = true; - break; - } + MachineInstr *MI = nullptr; + if (!MRI.def_empty(VReg)) + MI = &*MRI.def_instr_begin(VReg); + else if (!MRI.use_empty(VReg)) + MI = &*MRI.use_instr_begin(VReg); + + if (MI && !RC) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "VReg has no regclass after selection", *MI); + return false; + } else if (!RC) + continue; if (VRegToType.second.isValid() && - VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) { - if (TPC.isGlobalISelAbortEnabled()) - reportSelectionError( - MI, "VReg has explicit size different from class size"); - Failed = true; - break; + VRegToType.second.getSizeInBits() > TRI.getRegSizeInBits(*RC)) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "VReg has explicit size different from class size", + *MI); + return false; } } - MRI.getVRegToType().clear(); - - if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) { - MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + if (MF.size() != NumBlocks) { + MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure", + MF.getFunction()->getSubprogram(), + /*MBB=*/nullptr); + R << "inserting blocks is not supported yet"; + reportGISelFailure(MF, TPC, MORE, R); return false; } - assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet"); + + auto &TLI = *MF.getSubtarget().getTargetLowering(); + TLI.finalizeLowering(MF); // FIXME: Should we accurately track changes? return true; diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 5c34da0..bf42722 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp -----------*- C++ -*-==// +//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp --------------------===// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,41 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> #define DEBUG_TYPE "instructionselector" using namespace llvm; -InstructionSelector::InstructionSelector() {} +InstructionSelector::MatcherState::MatcherState(unsigned MaxRenderers) + : Renderers(MaxRenderers, nullptr), MIs() {} + +InstructionSelector::InstructionSelector() = default; + +bool InstructionSelector::constrainOperandRegToRegClass( + MachineInstr &I, unsigned OpIdx, const TargetRegisterClass &RC, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); 
+ + return + constrainRegToClass(MRI, TII, RBI, I, I.getOperand(OpIdx).getReg(), RC); +} bool InstructionSelector::constrainSelectedInstRegOperands( MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, @@ -55,6 +79,28 @@ bool InstructionSelector::constrainSelectedInstRegOperands( // constrainOperandRegClass does that for us. MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(), Reg, OpI)); + + // Tie uses to defs as indicated in MCInstrDesc if this hasn't already been + // done. + if (MO.isUse()) { + int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO); + if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx)) + I.tieOperands(DefIdx, OpI); + } } return true; } + +bool InstructionSelector::isOperandImmEqual( + const MachineOperand &MO, int64_t Value, + const MachineRegisterInfo &MRI) const { + if (MO.isReg() && MO.getReg()) + if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI)) + return *VRegVal == Value; + return false; +} + +bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const { + return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() && + MI.implicit_operands().begin() == MI.implicit_operands().end(); +} diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index e863568..b699156 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -15,13 +15,16 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" -#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <iterator> + #define DEBUG_TYPE "legalizer" using namespace llvm; @@ -47,71 +50,79 @@ void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const { void Legalizer::init(MachineFunction &MF) { } -bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, - const TargetInstrInfo &TII) { - bool Changed = false; - if (MI.getOpcode() != TargetOpcode::G_EXTRACT) - return Changed; +bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI, + const TargetInstrInfo &TII, + MachineIRBuilder &MIRBuilder) { + if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) + return false; - unsigned NumDefs = (MI.getNumOperands() - 1) / 2; + unsigned NumDefs = MI.getNumOperands() - 1; unsigned SrcReg = MI.getOperand(NumDefs).getReg(); - MachineInstr &SeqI = *MRI.def_instr_begin(SrcReg); - if (SeqI.getOpcode() != TargetOpcode::G_SEQUENCE) - return Changed; - - unsigned NumSeqSrcs = (SeqI.getNumOperands() - 1) / 2; - bool AllDefsReplaced = true; - - // Try to match each register extracted with a corresponding insertion formed - // by the G_SEQUENCE. 
- for (unsigned Idx = 0, SeqIdx = 0; Idx < NumDefs; ++Idx) { - MachineOperand &ExtractMO = MI.getOperand(Idx); - assert(ExtractMO.isReg() && ExtractMO.isDef() && - "unexpected extract operand"); - - unsigned ExtractReg = ExtractMO.getReg(); - unsigned ExtractPos = MI.getOperand(NumDefs + Idx + 1).getImm(); - - while (SeqIdx < NumSeqSrcs && - SeqI.getOperand(2 * SeqIdx + 2).getImm() < ExtractPos) - ++SeqIdx; - - if (SeqIdx == NumSeqSrcs) { - AllDefsReplaced = false; - continue; - } + MachineInstr &MergeI = *MRI.def_instr_begin(SrcReg); + if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES) + return false; - unsigned OrigReg = SeqI.getOperand(2 * SeqIdx + 1).getReg(); - if (SeqI.getOperand(2 * SeqIdx + 2).getImm() != ExtractPos || - MRI.getType(OrigReg) != MRI.getType(ExtractReg)) { - AllDefsReplaced = false; - continue; - } + const unsigned NumMergeRegs = MergeI.getNumOperands() - 1; + + if (NumMergeRegs < NumDefs) { + if (NumDefs % NumMergeRegs != 0) + return false; + + MIRBuilder.setInstr(MI); + // Transform to UNMERGEs, for example + // %1 = G_MERGE_VALUES %4, %5 + // %9, %10, %11, %12 = G_UNMERGE_VALUES %1 + // to + // %9, %10 = G_UNMERGE_VALUES %4 + // %11, %12 = G_UNMERGE_VALUES %5 - assert(!TargetRegisterInfo::isPhysicalRegister(OrigReg) && - "unexpected physical register in G_SEQUENCE"); + const unsigned NewNumDefs = NumDefs / NumMergeRegs; + for (unsigned Idx = 0; Idx < NumMergeRegs; ++Idx) { + SmallVector<unsigned, 2> DstRegs; + for (unsigned j = 0, DefIdx = Idx * NewNumDefs; j < NewNumDefs; + ++j, ++DefIdx) + DstRegs.push_back(MI.getOperand(DefIdx).getReg()); - // Finally we can replace the uses. - for (auto &Use : MRI.use_operands(ExtractReg)) { - Changed = true; - Use.setReg(OrigReg); + MIRBuilder.buildUnmerge(DstRegs, MergeI.getOperand(Idx + 1).getReg()); } - } - if (AllDefsReplaced) { - // If SeqI was the next instruction in the BB and we removed it, we'd break - // the outer iteration. - assert(std::next(MachineBasicBlock::iterator(MI)) != SeqI && - "G_SEQUENCE does not dominate G_EXTRACT"); + } else if (NumMergeRegs > NumDefs) { + if (NumMergeRegs % NumDefs != 0) + return false; + + MIRBuilder.setInstr(MI); + // Transform to MERGEs + // %6 = G_MERGE_VALUES %17, %18, %19, %20 + // %7, %8 = G_UNMERGE_VALUES %6 + // to + // %7 = G_MERGE_VALUES %17, %18 + // %8 = G_MERGE_VALUES %19, %20 + + const unsigned NumRegs = NumMergeRegs / NumDefs; + for (unsigned DefIdx = 0; DefIdx < NumDefs; ++DefIdx) { + SmallVector<unsigned, 2> Regs; + for (unsigned j = 0, Idx = NumRegs * DefIdx + 1; j < NumRegs; ++j, ++Idx) + Regs.push_back(MergeI.getOperand(Idx).getReg()); + + MIRBuilder.buildMerge(MI.getOperand(DefIdx).getReg(), Regs); + } - MI.eraseFromParent(); + } else { + // FIXME: is a COPY appropriate if the types mismatch? We know both + // registers are allocatable by now. 
+ if (MRI.getType(MI.getOperand(0).getReg()) != + MRI.getType(MergeI.getOperand(1).getReg())) + return false; - if (MRI.use_empty(SrcReg)) - SeqI.eraseFromParent(); - Changed = true; + for (unsigned Idx = 0; Idx < NumDefs; ++Idx) + MRI.replaceRegWith(MI.getOperand(Idx).getReg(), + MergeI.getOperand(Idx + 1).getReg()); } - return Changed; + MI.eraseFromParent(); + if (MRI.use_empty(MergeI.getOperand(0).getReg())) + MergeI.eraseFromParent(); + return true; } bool Legalizer::runOnMachineFunction(MachineFunction &MF) { @@ -122,7 +133,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n'); init(MF); const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); - const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo(); + MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); LegalizerHelper Helper(MF); // FIXME: an instruction may need more than one pass before it is legal. For @@ -132,7 +143,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // convergence for performance reasons. bool Changed = false; MachineBasicBlock::iterator NextMI; - for (auto &MBB : MF) + for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); MI = NextMI) { // Get the next Instruction before we try to legalize, because there's a // good chance MI will be deleted. @@ -142,27 +153,52 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // and are assumed to be legal. if (!isPreISelGenericOpcode(MI->getOpcode())) continue; - - auto Res = Helper.legalizeInstr(*MI, LegalizerInfo); - - // Error out if we couldn't legalize this instruction. We may want to fall - // back to DAG ISel instead in the future. - if (Res == LegalizerHelper::UnableToLegalize) { - if (!TPC.isGlobalISelAbortEnabled()) { - MF.getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return false; + unsigned NumNewInsns = 0; + SmallVector<MachineInstr *, 4> WorkList; + Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { + // Only legalize pre-isel generic instructions. + // Legalization process could generate Target specific pseudo + // instructions with generic types. Don't record them + if (isPreISelGenericOpcode(MI->getOpcode())) { + ++NumNewInsns; + WorkList.push_back(MI); } - std::string Msg; - raw_string_ostream OS(Msg); - OS << "unable to legalize instruction: "; - MI->print(OS); - report_fatal_error(OS.str()); - } - - Changed |= Res == LegalizerHelper::Legalized; - } + }); + WorkList.push_back(&*MI); + + bool Changed = false; + LegalizerHelper::LegalizeResult Res; + unsigned Idx = 0; + do { + Res = Helper.legalizeInstrStep(*WorkList[Idx]); + // Error out if we couldn't legalize this instruction. We may want to + // fall back to DAG ISel instead in the future. + if (Res == LegalizerHelper::UnableToLegalize) { + Helper.MIRBuilder.stopRecordingInsertions(); + if (Res == LegalizerHelper::UnableToLegalize) { + reportGISelFailure(MF, TPC, MORE, "gisel-legalize", + "unable to legalize instruction", + *WorkList[Idx]); + return false; + } + } + Changed |= Res == LegalizerHelper::Legalized; + ++Idx; + +#ifndef NDEBUG + if (NumNewInsns) { + DEBUG(dbgs() << ".. .. Emitted " << NumNewInsns << " insns\n"); + for (auto I = WorkList.end() - NumNewInsns, E = WorkList.end(); + I != E; ++I) + DEBUG(dbgs() << ".. .. 
New MI: "; (*I)->print(dbgs())); + NumNewInsns = 0; + } +#endif + } while (Idx < WorkList.size()); + Helper.MIRBuilder.stopRecordingInsertions(); + } + } MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -171,8 +207,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // Get the next Instruction before we try to legalize, because there's a // good chance MI will be deleted. NextMI = std::next(MI); - - Changed |= combineExtracts(*MI, MRI, TII); + Changed |= combineMerges(*MI, MRI, TII, Helper.MIRBuilder); } } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index eb25b6c..5258370 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -24,120 +24,174 @@ #include <sstream> -#define DEBUG_TYPE "legalize-mir" +#define DEBUG_TYPE "legalizer" using namespace llvm; LegalizerHelper::LegalizerHelper(MachineFunction &MF) - : MRI(MF.getRegInfo()) { + : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) { MIRBuilder.setMF(MF); } LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstrStep(MachineInstr &MI, - const LegalizerInfo &LegalizerInfo) { - auto Action = LegalizerInfo.getAction(MI, MRI); +LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { + DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs())); + + auto Action = LI.getAction(MI, MRI); switch (std::get<0>(Action)) { case LegalizerInfo::Legal: + DEBUG(dbgs() << ".. Already legal\n"); return AlreadyLegal; case LegalizerInfo::Libcall: + DEBUG(dbgs() << ".. Convert to libcall\n"); return libcall(MI); case LegalizerInfo::NarrowScalar: + DEBUG(dbgs() << ".. Narrow scalar\n"); return narrowScalar(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::WidenScalar: + DEBUG(dbgs() << ".. Widen scalar\n"); return widenScalar(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::Lower: + DEBUG(dbgs() << ".. Lower\n"); return lower(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::FewerElements: + DEBUG(dbgs() << ".. Reduce number of elements\n"); return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action)); + case LegalizerInfo::Custom: + DEBUG(dbgs() << ".. Custom legalization\n"); + return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized + : UnableToLegalize; default: + DEBUG(dbgs() << ".. Unable to legalize\n"); return UnableToLegalize; } } -LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstr(MachineInstr &MI, - const LegalizerInfo &LegalizerInfo) { - SmallVector<MachineInstr *, 4> WorkList; - MIRBuilder.recordInsertions( - [&](MachineInstr *MI) { WorkList.push_back(MI); }); - WorkList.push_back(&MI); - - bool Changed = false; - LegalizeResult Res; - unsigned Idx = 0; - do { - Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo); - if (Res == UnableToLegalize) { - MIRBuilder.stopRecordingInsertions(); - return UnableToLegalize; - } - Changed |= Res == Legalized; - ++Idx; - } while (Idx < WorkList.size()); - - MIRBuilder.stopRecordingInsertions(); - - return Changed ? 
Legalized : AlreadyLegal; -} - void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts, SmallVectorImpl<unsigned> &VRegs) { - unsigned Size = Ty.getSizeInBits(); - SmallVector<uint64_t, 4> Indexes; - for (int i = 0; i < NumParts; ++i) { + for (int i = 0; i < NumParts; ++i) VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); - Indexes.push_back(i * Size); + MIRBuilder.buildUnmerge(VRegs, Reg); +} + +static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { + switch (Opcode) { + case TargetOpcode::G_SDIV: + assert(Size == 32 && "Unsupported size"); + return RTLIB::SDIV_I32; + case TargetOpcode::G_UDIV: + assert(Size == 32 && "Unsupported size"); + return RTLIB::UDIV_I32; + case TargetOpcode::G_SREM: + assert(Size == 32 && "Unsupported size"); + return RTLIB::SREM_I32; + case TargetOpcode::G_UREM: + assert(Size == 32 && "Unsupported size"); + return RTLIB::UREM_I32; + case TargetOpcode::G_FADD: + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32; + case TargetOpcode::G_FREM: + return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32; + case TargetOpcode::G_FPOW: + return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32; } - MIRBuilder.buildExtract(VRegs, Indexes, Reg); + llvm_unreachable("Unknown libcall function"); +} + +LegalizerHelper::LegalizeResult +llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, + const CallLowering::ArgInfo &Result, + ArrayRef<CallLowering::ArgInfo> Args) { + auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + const char *Name = TLI.getLibcallName(Libcall); + + MIRBuilder.getMF().getFrameInfo().setHasCalls(true); + if (!CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall), + MachineOperand::CreateES(Name), Result, Args)) + return LegalizerHelper::UnableToLegalize; + + return LegalizerHelper::Legalized; +} + +static LegalizerHelper::LegalizeResult +simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, + Type *OpType) { + auto Libcall = getRTLibDesc(MI.getOpcode(), Size); + return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), OpType}, + {{MI.getOperand(1).getReg(), OpType}, + {MI.getOperand(2).getReg(), OpType}}); } LegalizerHelper::LegalizeResult LegalizerHelper::libcall(MachineInstr &MI) { - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - unsigned Size = Ty.getSizeInBits(); + LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = LLTy.getSizeInBits(); + auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + MIRBuilder.setInstr(MI); switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: { + Type *HLTy = Type::getInt32Ty(Ctx); + auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); + if (Status != Legalized) + return Status; + break; + } + case TargetOpcode::G_FADD: + case TargetOpcode::G_FPOW: case TargetOpcode::G_FREM: { - auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); - Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx); - auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); - auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); - const char *Name = - TLI.getLibcallName(Size == 64 ? 
RTLIB::REM_F64 : RTLIB::REM_F32); - - CLI.lowerCall( - MIRBuilder, MachineOperand::CreateES(Name), - {MI.getOperand(0).getReg(), Ty}, - {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}}); - MI.eraseFromParent(); - return Legalized; + Type *HLTy = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx); + auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); + if (Status != Legalized) + return Status; + break; } } + + MI.eraseFromParent(); + return Legalized; } LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { // FIXME: Don't know how to handle secondary types yet. - if (TypeIdx != 0) + if (TypeIdx != 0 && MI.getOpcode() != TargetOpcode::G_EXTRACT) return UnableToLegalize; + + MIRBuilder.setInstr(MI); + switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_IMPLICIT_DEF: { + int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / + NarrowTy.getSizeInBits(); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.buildUndef(Dst); + DstRegs.push_back(Dst); + } + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_ADD: { // Expand in terms of carry-setting/consuming G_ADDE instructions. - unsigned NarrowSize = NarrowTy.getSizeInBits(); int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowTy.getSizeInBits(); - MIRBuilder.setInstr(MI); - SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; - SmallVector<uint64_t, 2> Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -152,11 +206,193 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, Src2Regs[i], CarryIn); DstRegs.push_back(DstReg); - Indexes.push_back(i * NarrowSize); CarryIn = CarryOut; } unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_EXTRACT: { + if (TypeIdx != 1) + return UnableToLegalize; + + int64_t NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() / NarrowSize; + + SmallVector<unsigned, 2> SrcRegs, DstRegs; + SmallVector<uint64_t, 2> Indexes; + extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + + unsigned OpReg = MI.getOperand(0).getReg(); + int64_t OpStart = MI.getOperand(2).getImm(); + int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); + for (int i = 0; i < NumParts; ++i) { + unsigned SrcStart = i * NarrowSize; + + if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) { + // No part of the extract uses this subregister, ignore it. + continue; + } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) { + // The entire subregister is extracted, forward the value. + DstRegs.push_back(SrcRegs[i]); + continue; + } + + // OpSegStart is where this destination segment would start in OpReg if it + // extended infinitely in both directions. 
+ int64_t ExtractOffset, SegSize; + if (OpStart < SrcStart) { + ExtractOffset = 0; + SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart); + } else { + ExtractOffset = OpStart - SrcStart; + SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize); + } + + unsigned SegReg = SrcRegs[i]; + if (ExtractOffset != 0 || SegSize != NarrowSize) { + // A genuine extract is needed. + SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); + MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset); + } + + DstRegs.push_back(SegReg); + } + + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_INSERT: { + if (TypeIdx != 0) + return UnableToLegalize; + + int64_t NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + + SmallVector<unsigned, 2> SrcRegs, DstRegs; + SmallVector<uint64_t, 2> Indexes; + extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + + unsigned OpReg = MI.getOperand(2).getReg(); + int64_t OpStart = MI.getOperand(3).getImm(); + int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); + for (int i = 0; i < NumParts; ++i) { + unsigned DstStart = i * NarrowSize; + + if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) { + // No part of the insert affects this subregister, forward the original. + DstRegs.push_back(SrcRegs[i]); + continue; + } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) { + // The entire subregister is defined by this insert, forward the new + // value. + DstRegs.push_back(OpReg); + continue; + } + + // OpSegStart is where this destination segment would start in OpReg if it + // extended infinitely in both directions. + int64_t ExtractOffset, InsertOffset, SegSize; + if (OpStart < DstStart) { + InsertOffset = 0; + ExtractOffset = DstStart - OpStart; + SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart); + } else { + InsertOffset = OpStart - DstStart; + ExtractOffset = 0; + SegSize = + std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart); + } + + unsigned SegReg = OpReg; + if (ExtractOffset != 0 || SegSize != OpSize) { + // A genuine extract is needed. + SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); + MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset); + } + + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset); + DstRegs.push_back(DstReg); + } + + assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered"); + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_LOAD: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + LLT OffsetTy = LLT::scalar( + MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits()); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + unsigned SrcReg = 0; + unsigned Adjustment = i * NarrowSize / 8; + + MIRBuilder.materializeGEP(SrcReg, MI.getOperand(1).getReg(), OffsetTy, + Adjustment); + + // TODO: This is conservatively correct, but we probably want to split the + // memory operands in the future. 
+ MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin()); + + DstRegs.push_back(DstReg); + } + unsigned DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_STORE: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + LLT OffsetTy = LLT::scalar( + MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits()); + + SmallVector<unsigned, 2> SrcRegs; + extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs); + + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = 0; + unsigned Adjustment = i * NarrowSize / 8; + + MIRBuilder.materializeGEP(DstReg, MI.getOperand(1).getReg(), OffsetTy, + Adjustment); + + // TODO: This is conservatively correct, but we probably want to split the + // memory operands in the future. + MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin()); + } + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_CONSTANT: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + const APInt &Cst = MI.getOperand(1).getCImm()->getValue(); + LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + ConstantInt *CI = + ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize)); + MIRBuilder.buildConstant(DstReg, *CI); + DstRegs.push_back(DstReg); + } + unsigned DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -175,7 +411,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_MUL: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: - case TargetOpcode::G_SUB: { + case TargetOpcode::G_SUB: + case TargetOpcode::G_SHL: { // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. @@ -195,10 +432,16 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_SDIV: - case TargetOpcode::G_UDIV: { - unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV - ? TargetOpcode::G_SEXT - : TargetOpcode::G_ZEXT; + case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { + unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_SREM || + MI.getOpcode() == TargetOpcode::G_ASHR + ? TargetOpcode::G_SEXT + : TargetOpcode::G_ZEXT; unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse( @@ -218,6 +461,85 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SELECT: { + if (TypeIdx != 0) + return UnableToLegalize; + + // Perform operation at larger width (any extension is fine here, high bits + // don't affect the result) and then truncate the result back to the + // original type. 
+ unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy); + unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg()); + MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg()); + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildInstr(TargetOpcode::G_SELECT) + .addDef(DstExt) + .addReg(MI.getOperand(1).getReg()) + .addUse(Src1Ext) + .addUse(Src2Ext); + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: { + if (TypeIdx != 0) + return UnableToLegalize; + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildInstr(MI.getOpcode()) + .addDef(DstExt) + .addUse(MI.getOperand(1).getReg()); + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: { + if (TypeIdx != 1) + return UnableToLegalize; + + unsigned Src = MI.getOperand(1).getReg(); + unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + + if (MI.getOpcode() == TargetOpcode::G_SITOFP) { + MIRBuilder.buildSExt(SrcExt, Src); + } else { + assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op"); + MIRBuilder.buildZExt(SrcExt, Src); + } + + MIRBuilder.buildInstr(MI.getOpcode()) + .addDef(MI.getOperand(0).getReg()) + .addUse(SrcExt); + + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_INSERT: { + if (TypeIdx != 0) + return UnableToLegalize; + + unsigned Src = MI.getOperand(1).getReg(); + unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildAnyExt(SrcExt, Src); + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(), + MI.getOperand(3).getImm()); + for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) { + MIB.addReg(MI.getOperand(OpNum).getReg()); + MIB.addImm(MI.getOperand(OpNum + 1).getImm()); + } + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_LOAD: { assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == WideTy.getSizeInBits() && @@ -231,12 +553,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_STORE: { - assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == - WideTy.getSizeInBits() && - "illegal to increase number of bytes modified by a store"); + if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) || + WideTy != LLT::scalar(8)) + return UnableToLegalize; + + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + auto Content = TLI.getBooleanContents(false, false); + + unsigned ExtOp = TargetOpcode::G_ANYEXT; + if (Content == TargetLoweringBase::ZeroOrOneBooleanContent) + ExtOp = TargetOpcode::G_ZEXT; + else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent) + ExtOp = TargetOpcode::G_SEXT; + else + ExtOp = TargetOpcode::G_ANYEXT; unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); - MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg()); + MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse( + MI.getOperand(0).getReg()); MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(), **MI.memoperands_begin()); MI.eraseFromParent(); @@ -315,6 +649,83 @@ 
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SMULO: + case TargetOpcode::G_UMULO: { + // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the + // result. + unsigned Res = MI.getOperand(0).getReg(); + unsigned Overflow = MI.getOperand(1).getReg(); + unsigned LHS = MI.getOperand(2).getReg(); + unsigned RHS = MI.getOperand(3).getReg(); + + MIRBuilder.buildMul(Res, LHS, RHS); + + unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO + ? TargetOpcode::G_SMULH + : TargetOpcode::G_UMULH; + + unsigned HiPart = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInstr(Opcode) + .addDef(HiPart) + .addUse(LHS) + .addUse(RHS); + + unsigned Zero = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildConstant(Zero, 0); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FNEG: { + // TODO: Handle vector types once we are able to + // represent them. + if (Ty.isVector()) + return UnableToLegalize; + unsigned Res = MI.getOperand(0).getReg(); + Type *ZeroTy; + LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + switch (Ty.getSizeInBits()) { + case 16: + ZeroTy = Type::getHalfTy(Ctx); + break; + case 32: + ZeroTy = Type::getFloatTy(Ctx); + break; + case 64: + ZeroTy = Type::getDoubleTy(Ctx); + break; + default: + llvm_unreachable("unexpected floating-point type"); + } + ConstantFP &ZeroForNegation = + *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy)); + unsigned Zero = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildFConstant(Zero, ZeroForNegation); + MIRBuilder.buildInstr(TargetOpcode::G_FSUB) + .addDef(Res) + .addUse(Zero) + .addUse(MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FSUB: { + // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). + // First, check if G_FNEG is marked as Lower. If so, we may + // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 
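  // Illustrative sketch of the cycle the next guard prevents (this note and
  // the example MIR are not part of the upstream hunk): the G_FNEG lowering
  // above emits G_FSUB(-0.0, X), so if G_FSUB were also lowered while G_FNEG
  // is marked Lower, the two lowerings would feed each other indefinitely:
  //   %d = G_FSUB %a, %b   -->  %n = G_FNEG %b ; %d = G_FADD %a, %n
  //   %n = G_FNEG %b       -->  %n = G_FSUB -0.0, %b  --> ... and so on.
  // Hence the early bail-out below when G_FNEG's action for this type is
  // also Lower.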
+ if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower) + return UnableToLegalize; + unsigned Res = MI.getOperand(0).getReg(); + unsigned LHS = MI.getOperand(1).getReg(); + unsigned RHS = MI.getOperand(2).getReg(); + unsigned Neg = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS); + MIRBuilder.buildInstr(TargetOpcode::G_FADD) + .addDef(Res) + .addUse(LHS) + .addUse(Neg); + MI.eraseFromParent(); + return Legalized; + } } } @@ -335,7 +746,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, MIRBuilder.setInstr(MI); SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; - SmallVector<uint64_t, 2> Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -343,10 +753,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]); DstRegs.push_back(DstReg); - Indexes.push_back(i * NarrowSize); } - MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index e496620..76917aa 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -1,4 +1,4 @@ -//===---- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer -------==// +//===- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer ---------------===// // // The LLVM Compiler Infrastructure // @@ -18,16 +18,25 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" - #include "llvm/ADT/SmallBitVector.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/Type.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOpcodes.h" +#include <algorithm> +#include <cassert> +#include <tuple> +#include <utility> + using namespace llvm; -LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { +LegalizerInfo::LegalizerInfo() { + DefaultActions[TargetOpcode::G_IMPLICIT_DEF] = NarrowScalar; + // FIXME: these two can be legalized to the fundamental load/store Jakob // proposed. Once loads & stores are supported. DefaultActions[TargetOpcode::G_ANYEXT] = Legal; @@ -41,6 +50,9 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { DefaultActions[TargetOpcode::G_STORE] = NarrowScalar; DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar; + DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar; + DefaultActions[TargetOpcode::G_EXTRACT] = NarrowScalar; + DefaultActions[TargetOpcode::G_FNEG] = Lower; } void LegalizerInfo::computeTables() { @@ -71,28 +83,34 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const { // These *have* to be implemented for now, they're the fundamental basis of // how everything else is transformed. - // Nothing is going to go well with types that aren't a power of 2 yet, so - // don't even try because we might make things worse. 
- if (!isPowerOf2_64(Aspect.Type.getSizeInBits())) - return std::make_pair(Unsupported, LLT()); - // FIXME: the long-term plan calls for expansion in terms of load/store (if // they're not legal). - if (Aspect.Opcode == TargetOpcode::G_SEQUENCE || - Aspect.Opcode == TargetOpcode::G_EXTRACT) + if (Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || + Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES) return std::make_pair(Legal, Aspect.Type); + LLT Ty = Aspect.Type; LegalizeAction Action = findInActions(Aspect); + // LegalizerHelper is not able to handle non-power-of-2 types right now, so do + // not try to legalize them unless they are marked as Legal or Custom. + // FIXME: This is a temporary hack until the general non-power-of-2 + // legalization works. + if (!isPowerOf2_64(Ty.getSizeInBits()) && + !(Action == Legal || Action == Custom)) + return std::make_pair(Unsupported, LLT()); + if (Action != NotFound) return findLegalAction(Aspect, Action); unsigned Opcode = Aspect.Opcode; - LLT Ty = Aspect.Type; if (!Ty.isVector()) { auto DefaultAction = DefaultActions.find(Aspect.Opcode); if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal) return std::make_pair(Legal, Ty); + if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower) + return std::make_pair(Lower, Ty); + if (DefaultAction == DefaultActions.end() || DefaultAction->second != NarrowScalar) return std::make_pair(Unsupported, LLT()); @@ -152,7 +170,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI, return std::get<0>(getAction(MI, MRI)) == Legal; } -LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect, +Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const { switch(Action) { default: @@ -160,23 +178,30 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect, case Legal: case Lower: case Libcall: + case Custom: return Aspect.Type; case NarrowScalar: { - return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); } case WidenScalar: { - return findLegalType(Aspect, [&](LLT Ty) -> LLT { + return findLegalizableSize(Aspect, [&](LLT Ty) -> LLT { return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize(); }); } case FewerElements: { - return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.halfElements(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.halfElements(); }); } case MoreElements: { - return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); } } } + +bool LegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + return false; +} diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp new file mode 100644 index 0000000..c5d0999 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -0,0 +1,123 @@ +//===- Localizer.cpp ---------------------- Localize some instrs -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the Localizer class. 
+//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "localizer" + +using namespace llvm; + +char Localizer::ID = 0; +INITIALIZE_PASS(Localizer, DEBUG_TYPE, + "Move/duplicate certain instructions close to their use", false, + false) + +Localizer::Localizer() : MachineFunctionPass(ID) { + initializeLocalizerPass(*PassRegistry::getPassRegistry()); +} + +void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); } + +bool Localizer::shouldLocalize(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Constants-like instructions should be close to their users. + // We don't want long live-ranges for them. + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + return true; + } +} + +bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def, + MachineBasicBlock *&InsertMBB) { + MachineInstr &MIUse = *MOUse.getParent(); + InsertMBB = MIUse.getParent(); + if (MIUse.isPHI()) + InsertMBB = MIUse.getOperand(MIUse.getOperandNo(&MOUse) + 1).getMBB(); + return InsertMBB == Def.getParent(); +} + +bool Localizer::runOnMachineFunction(MachineFunction &MF) { + // If the ISel pipeline failed, do not bother running that pass. + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + + DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n'); + + init(MF); + + bool Changed = false; + // Keep track of the instructions we localized. + // We won't need to process them if we see them later in the CFG. + SmallPtrSet<MachineInstr *, 16> LocalizedInstrs; + DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef; + // TODO: Do bottom up traversal. + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) + continue; + DEBUG(dbgs() << "Should localize: " << MI); + assert(MI.getDesc().getNumDefs() == 1 && + "More than one definition not supported yet"); + unsigned Reg = MI.getOperand(0).getReg(); + // Check if all the users of MI are local. + // We are going to invalidation the list of use operands, so we + // can't use range iterator. + for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); + MOIt != MOItEnd;) { + MachineOperand &MOUse = *MOIt++; + // Check if the use is already local. + MachineBasicBlock *InsertMBB; + DEBUG(MachineInstr &MIUse = *MOUse.getParent(); + dbgs() << "Checking use: " << MIUse + << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); + if (isLocalUse(MOUse, MI, InsertMBB)) + continue; + DEBUG(dbgs() << "Fixing non-local use\n"); + Changed = true; + auto MBBAndReg = std::make_pair(InsertMBB, Reg); + auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); + if (NewVRegIt == MBBWithLocalDef.end()) { + // Create the localized instruction. + MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); + LocalizedInstrs.insert(LocalizedMI); + // Don't try to be smart for the insertion point. + // There is no guarantee that the first seen use is the first + // use in the block. + InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI); + + // Set a new register for the definition. 
+ unsigned NewReg = + MRI->createGenericVirtualRegister(MRI->getType(Reg)); + MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); + LocalizedMI->getOperand(0).setReg(NewReg); + NewVRegIt = + MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; + DEBUG(dbgs() << "Inserted: " << *LocalizedMI); + } + DEBUG(dbgs() << "Update use with: " << PrintReg(NewVRegIt->second) + << '\n'); + // Update the user reg. + MOUse.setReg(NewVRegIt->second); + } + } + } + return Changed; +} diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index c04f6e4..4636806 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -54,7 +55,7 @@ void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB, void MachineIRBuilder::recordInsertions( std::function<void(MachineInstr *)> Inserted) { - InsertedInstr = Inserted; + InsertedInstr = std::move(Inserted); } void MachineIRBuilder::stopRecordingInsertions() { @@ -82,6 +83,70 @@ MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) { return MIB; } +MachineInstrBuilder MachineIRBuilder::buildDirectDbgValue( + unsigned Reg, const MDNode *Variable, const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addReg(Reg, RegState::Debug) + .addReg(0, RegState::Debug) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildIndirectDbgValue( + unsigned Reg, unsigned Offset, const MDNode *Variable, const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addReg(Reg, RegState::Debug) + .addImm(Offset) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, + const MDNode *Variable, + const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addFrameIndex(FI) + .addImm(0) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, + unsigned Offset, + const MDNode *Variable, + const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + auto MIB = buildInstr(TargetOpcode::DBG_VALUE); + if (auto *CI = dyn_cast<ConstantInt>(&C)) { + if 
(CI->getBitWidth() > 64) + MIB.addCImm(CI); + else + MIB.addImm(CI->getZExtValue()); + } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) { + MIB.addFPImm(CFP); + } else { + // Insert %noreg if we didn't find a usable constant and had to drop it. + MIB.addReg(0U); + } + + return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr); +} + MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) { assert(MRI->getType(Res).isPointer() && "invalid operand type"); return buildInstr(TargetOpcode::G_FRAME_INDEX) @@ -101,19 +166,24 @@ MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res, .addGlobalAddress(GV); } -MachineInstrBuilder MachineIRBuilder::buildAdd(unsigned Res, unsigned Op0, +MachineInstrBuilder MachineIRBuilder::buildBinaryOp(unsigned Opcode, unsigned Res, unsigned Op0, unsigned Op1) { assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && "invalid operand type"); assert(MRI->getType(Res) == MRI->getType(Op0) && MRI->getType(Res) == MRI->getType(Op1) && "type mismatch"); - return buildInstr(TargetOpcode::G_ADD) + return buildInstr(Opcode) .addDef(Res) .addUse(Op0) .addUse(Op1); } +MachineInstrBuilder MachineIRBuilder::buildAdd(unsigned Res, unsigned Op0, + unsigned Op1) { + return buildBinaryOp(TargetOpcode::G_ADD, Res, Op0, Op1); +} + MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0, unsigned Op1) { assert(MRI->getType(Res).isPointer() && @@ -126,37 +196,67 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0, .addUse(Op1); } -MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0, - unsigned Op1) { - assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && - "invalid operand type"); - assert(MRI->getType(Res) == MRI->getType(Op0) && - MRI->getType(Res) == MRI->getType(Op1) && "type mismatch"); +Optional<MachineInstrBuilder> +MachineIRBuilder::materializeGEP(unsigned &Res, unsigned Op0, + const LLT &ValueTy, uint64_t Value) { + assert(Res == 0 && "Res is a result argument"); + assert(ValueTy.isScalar() && "invalid offset type"); - return buildInstr(TargetOpcode::G_SUB) + if (Value == 0) { + Res = Op0; + return None; + } + + Res = MRI->createGenericVirtualRegister(MRI->getType(Op0)); + unsigned TmpReg = MRI->createGenericVirtualRegister(ValueTy); + + buildConstant(TmpReg, Value); + return buildGEP(Res, Op0, TmpReg); +} + +MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0, + uint32_t NumBits) { + assert(MRI->getType(Res).isPointer() && + MRI->getType(Res) == MRI->getType(Op0) && "type mismatch"); + + return buildInstr(TargetOpcode::G_PTR_MASK) .addDef(Res) .addUse(Op0) - .addUse(Op1); + .addImm(NumBits); +} + +MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0, + unsigned Op1) { + return buildBinaryOp(TargetOpcode::G_SUB, Res, Op0, Op1); } MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0, unsigned Op1) { - assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && - "invalid operand type"); - assert(MRI->getType(Res) == MRI->getType(Op0) && - MRI->getType(Res) == MRI->getType(Op1) && "type mismatch"); + return buildBinaryOp(TargetOpcode::G_MUL, Res, Op0, Op1); +} - return buildInstr(TargetOpcode::G_MUL) - .addDef(Res) - .addUse(Op0) - .addUse(Op1); +MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0, + unsigned Op1) { + return buildBinaryOp(TargetOpcode::G_AND, Res, Op0, Op1); +} + +MachineInstrBuilder MachineIRBuilder::buildOr(unsigned 
Res, unsigned Op0, + unsigned Op1) { + return buildBinaryOp(TargetOpcode::G_OR, Res, Op0, Op1); } MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { return buildInstr(TargetOpcode::G_BR).addMBB(&Dest); } +MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) { + assert(MRI->getType(Tgt).isPointer() && "invalid branch destination"); + return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt); +} + MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) { + assert(MRI->getType(Res) == LLT() || MRI->getType(Op) == LLT() || + MRI->getType(Res) == MRI->getType(Op)); return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op); } @@ -253,49 +353,78 @@ MachineInstrBuilder MachineIRBuilder::buildZExt(unsigned Res, unsigned Op) { MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res, unsigned Op) { + assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()); + assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar()); + unsigned Opcode = TargetOpcode::COPY; if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_SEXT; else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_TRUNC; + else + assert(MRI->getType(Res) == MRI->getType(Op)); return buildInstr(Opcode).addDef(Res).addUse(Op); } -MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results, - ArrayRef<uint64_t> Indices, - unsigned Src) { -#ifndef NDEBUG - assert(Results.size() == Indices.size() && "inconsistent number of regs"); - assert(!Results.empty() && "invalid trivial extract"); - assert(std::is_sorted(Indices.begin(), Indices.end()) && - "extract offsets must be in ascending order"); +MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res, + unsigned Op) { + assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()); + assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar()); - assert(MRI->getType(Src).isValid() && "invalid operand type"); - for (auto Res : Results) - assert(MRI->getType(Res).isValid() && "invalid operand type"); -#endif + unsigned Opcode = TargetOpcode::COPY; + if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) + Opcode = TargetOpcode::G_ZEXT; + else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) + Opcode = TargetOpcode::G_TRUNC; + else + assert(MRI->getType(Res) == MRI->getType(Op)); - auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT)); - for (auto Res : Results) - MIB.addDef(Res); + return buildInstr(Opcode).addDef(Res).addUse(Op); +} - MIB.addUse(Src); +MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) { + LLT SrcTy = MRI->getType(Src); + LLT DstTy = MRI->getType(Dst); + if (SrcTy == DstTy) + return buildCopy(Dst, Src); + + unsigned Opcode; + if (SrcTy.isPointer() && DstTy.isScalar()) + Opcode = TargetOpcode::G_PTRTOINT; + else if (DstTy.isPointer() && SrcTy.isScalar()) + Opcode = TargetOpcode::G_INTTOPTR; + else { + assert(!SrcTy.isPointer() && !DstTy.isPointer() && "n G_ADDRCAST yet"); + Opcode = TargetOpcode::G_BITCAST; + } - for (auto Idx : Indices) - MIB.addImm(Idx); + return buildInstr(Opcode).addDef(Dst).addUse(Src); +} - getMBB().insert(getInsertPt(), MIB); - if (InsertedInstr) - InsertedInstr(MIB); +MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src, + uint64_t Index) { +#ifndef NDEBUG + assert(MRI->getType(Src).isValid() && "invalid operand type"); + 
assert(MRI->getType(Res).isValid() && "invalid operand type"); + assert(Index + MRI->getType(Res).getSizeInBits() <= + MRI->getType(Src).getSizeInBits() && + "extracting off end of register"); +#endif - return MIB; + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) { + assert(Index == 0 && "insertion past the end of a register"); + return buildCast(Res, Src); + } + + return buildInstr(TargetOpcode::G_EXTRACT) + .addDef(Res) + .addUse(Src) + .addImm(Index); } -MachineInstrBuilder -MachineIRBuilder::buildSequence(unsigned Res, - ArrayRef<unsigned> Ops, - ArrayRef<uint64_t> Indices) { +void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops, + ArrayRef<uint64_t> Indices) { #ifndef NDEBUG assert(Ops.size() == Indices.size() && "incompatible args"); assert(!Ops.empty() && "invalid trivial sequence"); @@ -307,15 +436,97 @@ MachineIRBuilder::buildSequence(unsigned Res, assert(MRI->getType(Op).isValid() && "invalid operand type"); #endif - MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_SEQUENCE); - MIB.addDef(Res); + LLT ResTy = MRI->getType(Res); + LLT OpTy = MRI->getType(Ops[0]); + unsigned OpSize = OpTy.getSizeInBits(); + bool MaybeMerge = true; for (unsigned i = 0; i < Ops.size(); ++i) { - MIB.addUse(Ops[i]); - MIB.addImm(Indices[i]); + if (MRI->getType(Ops[i]) != OpTy || Indices[i] != i * OpSize) { + MaybeMerge = false; + break; + } + } + + if (MaybeMerge && Ops.size() * OpSize == ResTy.getSizeInBits()) { + buildMerge(Res, Ops); + return; } + + unsigned ResIn = MRI->createGenericVirtualRegister(ResTy); + buildUndef(ResIn); + + for (unsigned i = 0; i < Ops.size(); ++i) { + unsigned ResOut = + i + 1 == Ops.size() ? Res : MRI->createGenericVirtualRegister(ResTy); + buildInsert(ResOut, ResIn, Ops[i], Indices[i]); + ResIn = ResOut; + } +} + +MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { + return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res); +} + +MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, + ArrayRef<unsigned> Ops) { + +#ifndef NDEBUG + assert(!Ops.empty() && "invalid trivial sequence"); + LLT Ty = MRI->getType(Ops[0]); + for (auto Reg : Ops) + assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); + assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() == + MRI->getType(Res).getSizeInBits() && + "input operands do not cover output register"); +#endif + + if (Ops.size() == 1) + return buildCast(Res, Ops[0]); + + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); + MIB.addDef(Res); + for (unsigned i = 0; i < Ops.size(); ++i) + MIB.addUse(Ops[i]); return MIB; } +MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res, + unsigned Op) { + +#ifndef NDEBUG + assert(!Res.empty() && "invalid trivial sequence"); + LLT Ty = MRI->getType(Res[0]); + for (auto Reg : Res) + assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); + assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() == + MRI->getType(Op).getSizeInBits() && + "input operands do not cover output register"); +#endif + + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES); + for (unsigned i = 0; i < Res.size(); ++i) + MIB.addDef(Res[i]); + MIB.addUse(Op); + return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, + unsigned Op, unsigned Index) { + assert(Index + MRI->getType(Op).getSizeInBits() <= + MRI->getType(Res).getSizeInBits() && + "insertion past the end of a register"); + + if (MRI->getType(Res).getSizeInBits() == 
MRI->getType(Op).getSizeInBits()) { + return buildCast(Res, Op); + } + + return buildInstr(TargetOpcode::G_INSERT) + .addDef(Res) + .addUse(Src) + .addUse(Op) + .addImm(Index); +} + MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID, unsigned Res, bool HasSideEffects) { @@ -395,9 +606,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst, if (ResTy.isScalar() || ResTy.isPointer()) assert(MRI->getType(Tst).isScalar() && "type mismatch"); else - assert(MRI->getType(Tst).isVector() && - MRI->getType(Tst).getNumElements() == - MRI->getType(Op0).getNumElements() && + assert((MRI->getType(Tst).isScalar() || + (MRI->getType(Tst).isVector() && + MRI->getType(Tst).getNumElements() == + MRI->getType(Op0).getNumElements())) && "type mismatch"); #endif @@ -408,6 +620,47 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst, .addUse(Op1); } +MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res, + unsigned Val, + unsigned Elt, + unsigned Idx) { +#ifndef NDEBUG + LLT ResTy = MRI->getType(Res); + LLT ValTy = MRI->getType(Val); + LLT EltTy = MRI->getType(Elt); + LLT IdxTy = MRI->getType(Idx); + assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type"); + assert(IdxTy.isScalar() && "invalid operand type"); + assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch"); + assert(ResTy.getElementType() == EltTy && "type mismatch"); +#endif + + return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT) + .addDef(Res) + .addUse(Val) + .addUse(Elt) + .addUse(Idx); +} + +MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res, + unsigned Val, + unsigned Idx) { +#ifndef NDEBUG + LLT ResTy = MRI->getType(Res); + LLT ValTy = MRI->getType(Val); + LLT IdxTy = MRI->getType(Idx); + assert(ValTy.isVector() && "invalid operand type"); + assert((ResTy.isScalar() || ResTy.isPointer()) && "invalid operand type"); + assert(IdxTy.isScalar() && "invalid operand type"); + assert(ValTy.getElementType() == ResTy && "type mismatch"); +#endif + + return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT) + .addDef(Res) + .addUse(Val) + .addUse(Idx); +} + void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src, bool IsExtend) { #ifndef NDEBUG diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index cc026ef..677941d 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect -*- C++ -*-==// +//==- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,39 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include 
"llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <limits> +#include <memory> +#include <utility> #define DEBUG_TYPE "regbankselect" @@ -36,6 +58,7 @@ static cl::opt<RegBankSelect::Mode> RegBankSelectMode( "Use the Greedy mode (best local mapping)"))); char RegBankSelect::ID = 0; + INITIALIZE_PASS_BEGIN(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false); @@ -47,8 +70,7 @@ INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, false) RegBankSelect::RegBankSelect(Mode RunningMode) - : MachineFunctionPass(ID), RBI(nullptr), MRI(nullptr), TRI(nullptr), - MBFI(nullptr), MBPI(nullptr), OptMode(RunningMode) { + : MachineFunctionPass(ID), OptMode(RunningMode) { initializeRegBankSelectPass(*PassRegistry::getPassRegistry()); if (RegBankSelectMode.getNumOccurrences() != 0) { OptMode = RegBankSelectMode; @@ -71,6 +93,7 @@ void RegBankSelect::init(MachineFunction &MF) { MBPI = nullptr; } MIRBuilder.setMF(MF); + MORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); } void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ -131,9 +154,11 @@ bool RegBankSelect::repairReg( TargetRegisterInfo::isPhysicalRegister(Dst)) && "We are about to create several defs for Dst"); - // Build the instruction used to repair, then clone it at the right places. - MachineInstr *MI = MIRBuilder.buildCopy(Dst, Src); - MI->removeFromParent(); + // Build the instruction used to repair, then clone it at the right + // places. Avoiding buildCopy bypasses the check that Src and Dst have the + // same types because the type is a placeholder when this function is called. + MachineInstr *MI = + MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY).addDef(Dst).addUse(Src); DEBUG(dbgs() << "Copy: " << PrintReg(Src) << " to: " << PrintReg(Dst) << '\n'); // TODO: @@ -200,32 +225,30 @@ uint64_t RegBankSelect::getRepairCost( RBI->copyCost(*DesiredRegBrank, *CurRegBank, RegisterBankInfo::getSizeInBits(MO.getReg(), *MRI, *TRI)); // TODO: use a dedicated constant for ImpossibleCost. - if (Cost != UINT_MAX) + if (Cost != std::numeric_limits<unsigned>::max()) return Cost; - assert(!TPC->isGlobalISelAbortEnabled() && - "Legalization not available yet"); // Return the legalization cost of that repairing. 
} - assert(!TPC->isGlobalISelAbortEnabled() && - "Complex repairing not implemented yet"); - return UINT_MAX; + return std::numeric_limits<unsigned>::max(); } -RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( +const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( MachineInstr &MI, RegisterBankInfo::InstructionMappings &PossibleMappings, SmallVectorImpl<RepairingPlacement> &RepairPts) { assert(!PossibleMappings.empty() && "Do not know how to map this instruction"); - RegisterBankInfo::InstructionMapping *BestMapping = nullptr; + const RegisterBankInfo::InstructionMapping *BestMapping = nullptr; MappingCost Cost = MappingCost::ImpossibleCost(); SmallVector<RepairingPlacement, 4> LocalRepairPts; - for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) { - MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost); + for (const RegisterBankInfo::InstructionMapping *CurMapping : + PossibleMappings) { + MappingCost CurCost = + computeMapping(MI, *CurMapping, LocalRepairPts, &Cost); if (CurCost < Cost) { DEBUG(dbgs() << "New best: " << CurCost << '\n'); Cost = CurCost; - BestMapping = &CurMapping; + BestMapping = CurMapping; RepairPts.clear(); for (RepairingPlacement &RepairPt : LocalRepairPts) RepairPts.emplace_back(std::move(RepairPt)); @@ -235,7 +258,7 @@ RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( // If none of the mapping worked that means they are all impossible. // Thus, pick the first one and set an impossible repairing point. // It will trigger the failed isel mode. - BestMapping = &(*PossibleMappings.begin()); + BestMapping = *PossibleMappings.begin(); RepairPts.emplace_back( RepairingPlacement(MI, 0, *TRI, *this, RepairingPlacement::Impossible)); } else @@ -352,7 +375,7 @@ void RegBankSelect::tryAvoidingSplit( // the repairing cost because of the PHIs already proceeded // as already stated. // Though the code will be correct. - assert(0 && "Repairing cost may not be accurate"); + assert(false && "Repairing cost may not be accurate"); } else { // We need to do non-local repairing. Basically, patch all // the uses (i.e., phis) that we already proceeded. @@ -448,6 +471,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( // Sums up the repairing cost of MO at each insertion point. uint64_t RepairCost = getRepairCost(MO, ValMapping); + + // This is an impossible to repair cost. + if (RepairCost == std::numeric_limits<unsigned>::max()) + continue; + // Bias used for splitting: 5%. const uint64_t PercentageForBias = 5; uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100; @@ -530,9 +558,11 @@ bool RegBankSelect::applyMapping( llvm_unreachable("Other kind should not happen"); } } + // Second, rewrite the instruction. DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n'); RBI->applyMapping(OpdMapper); + return true; } @@ -541,10 +571,10 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { // Remember the repairing placement for all the operands. 
SmallVector<RepairingPlacement, 4> RepairPts; - RegisterBankInfo::InstructionMapping BestMapping; + const RegisterBankInfo::InstructionMapping *BestMapping; if (OptMode == RegBankSelect::Mode::Fast) { - BestMapping = RBI->getInstrMapping(MI); - MappingCost DefaultCost = computeMapping(MI, BestMapping, RepairPts); + BestMapping = &RBI->getInstrMapping(MI); + MappingCost DefaultCost = computeMapping(MI, *BestMapping, RepairPts); (void)DefaultCost; if (DefaultCost == MappingCost::ImpossibleCost()) return false; @@ -553,16 +583,16 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { RBI->getInstrPossibleMappings(MI); if (PossibleMappings.empty()) return false; - BestMapping = std::move(findBestMapping(MI, PossibleMappings, RepairPts)); + BestMapping = &findBestMapping(MI, PossibleMappings, RepairPts); } // Make sure the mapping is valid for MI. - assert(BestMapping.verify(MI) && "Invalid instruction mapping"); + assert(BestMapping->verify(MI) && "Invalid instruction mapping"); - DEBUG(dbgs() << "Best Mapping: " << BestMapping << '\n'); + DEBUG(dbgs() << "Best Mapping: " << *BestMapping << '\n'); // After this call, MI may not be valid anymore. // Do not use it. - return applyMapping(MI, BestMapping, RepairPts); + return applyMapping(MI, *BestMapping, RepairPts); } bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { @@ -585,18 +615,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { // LegalizerInfo as it's currently in the separate GlobalISel library. const MachineRegisterInfo &MRI = MF.getRegInfo(); if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) { - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF.getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return false; - } - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << "Instruction is not legal: " << MI << '\n'; - report_fatal_error(Err.str()); + reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", + "instruction is not legal", MI); + return false; } } } @@ -622,9 +646,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { continue; if (!assignInstr(MI)) { - if (TPC->isGlobalISelAbortEnabled()) - report_fatal_error("Unable to map instruction"); - MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", + "unable to map instruction", MI); return false; } } @@ -640,11 +663,8 @@ RegBankSelect::RepairingPlacement::RepairingPlacement( MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P, RepairingPlacement::RepairingKind Kind) // Default is, we are going to insert code to repair OpIdx. 
- : Kind(Kind), - OpIdx(OpIdx), - CanMaterialize(Kind != RepairingKind::Impossible), - HasSplit(false), - P(P) { + : Kind(Kind), OpIdx(OpIdx), + CanMaterialize(Kind != RepairingKind::Impossible), P(P) { const MachineOperand &MO = MI.getOperand(OpIdx); assert(MO.isReg() && "Trying to repair a non-reg operand"); @@ -849,7 +869,7 @@ bool RegBankSelect::EdgeInsertPoint::canMaterialize() const { } RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq) - : LocalCost(0), NonLocalCost(0), LocalFreq(LocalFreq.getFrequency()) {} + : LocalFreq(LocalFreq.getFrequency()) {} bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) { // Check if this overflows. @@ -922,7 +942,6 @@ bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const { OtherLocalAdjust = Cost.LocalCost - LocalCost; else ThisLocalAdjust = LocalCost - Cost.LocalCost; - } else { ThisLocalAdjust = LocalCost; OtherLocalAdjust = Cost.LocalCost; @@ -968,10 +987,12 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const { LocalFreq == Cost.LocalFreq; } -void RegBankSelect::MappingCost::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegBankSelect::MappingCost::print(raw_ostream &OS) const { if (*this == ImpossibleCost()) { diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp index 49d676f..83b21e6 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,10 +19,11 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size, - const uint32_t *CoveredClasses) +RegisterBank::RegisterBank( + unsigned ID, const char *Name, unsigned Size, + const uint32_t *CoveredClasses, unsigned NumRegClasses) : ID(ID), Name(Name), Size(Size) { - ContainedRegClasses.resize(200); + ContainedRegClasses.resize(NumRegClasses); ContainedRegClasses.setBitsInMask(CoveredClasses); } @@ -47,7 +48,7 @@ bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { // Verify that the Size of the register bank is big enough to cover // all the register classes it covers. 
- assert((getSize() >= SubRC.getSize() * 8) && + assert(getSize() >= TRI.getRegSizeInBits(SubRC) && "Size is not big enough for all the subclasses!"); assert(covers(SubRC) && "Not all subclasses are covered"); } @@ -75,9 +76,11 @@ bool RegisterBank::operator==(const RegisterBank &OtherRB) const { return &OtherRB == this; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const { print(dbgs(), /* IsForDebug */ true, TRI); } +#endif void RegisterBank::print(raw_ostream &OS, bool IsForDebug, const TargetRegisterInfo *TRI) const { diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index da5ab0b..a841902 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -45,6 +45,10 @@ STATISTIC(NumOperandsMappingsCreated, "Number of operands mappings dynamically created"); STATISTIC(NumOperandsMappingsAccessed, "Number of operands mappings dynamically accessed"); +STATISTIC(NumInstructionMappingsCreated, + "Number of instruction mappings dynamically created"); +STATISTIC(NumInstructionMappingsAccessed, + "Number of instruction mappings dynamically accessed"); const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX; const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; @@ -63,13 +67,6 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, #endif // NDEBUG } -RegisterBankInfo::~RegisterBankInfo() { - for (auto It : MapOfPartialMappings) - delete It.second; - for (auto It : MapOfValueMappings) - delete It.second; -} - bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { #ifndef NDEBUG for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { @@ -133,19 +130,27 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( return &RC; } -RegisterBankInfo::InstructionMapping +/// Check whether or not \p MI should be treated like a copy +/// for the mappings. +/// Copy like instruction are special for mapping because +/// they don't have actual register constraints. Moreover, +/// they sometimes have register classes assigned and we can +/// just use that instead of failing to provide a generic mapping. +static bool isCopyLike(const MachineInstr &MI) { + return MI.isCopy() || MI.isPHI() || + MI.getOpcode() == TargetOpcode::REG_SEQUENCE; +} + +const RegisterBankInfo::InstructionMapping & RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // For copies we want to walk over the operands and try to find one // that has a register bank since the instruction itself will not get // us any constraint. - bool isCopyLike = MI.isCopy() || MI.isPHI(); + bool IsCopyLike = isCopyLike(MI); // For copy like instruction, only the mapping of the definition // is important. The rest is not constrained. - unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands(); + unsigned NumOperandsForMapping = IsCopyLike ? 
1 : MI.getNumOperands(); - RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1, - /*OperandsMapping*/ nullptr, - NumOperandsForMapping); const MachineFunction &MF = *MI.getParent()->getParent(); const TargetSubtargetInfo &STI = MF.getSubtarget(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); @@ -175,7 +180,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // For copy-like instruction, we want to reuse the register bank // that is already set on Reg, if any, since those instructions do // not have any constraints. - const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr; + const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr; if (!CurRegBank) { // If this is a target specific instruction, we can deduce // the register bank from the encoding constraints. @@ -184,15 +189,15 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // All our attempts failed, give up. CompleteMapping = false; - if (!isCopyLike) + if (!IsCopyLike) // MI does not carry enough information to guess the mapping. - return InstructionMapping(); + return getInvalidInstructionMapping(); continue; } } const ValueMapping *ValMapping = &getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank); - if (isCopyLike) { + if (IsCopyLike) { OperandsMapping[0] = ValMapping; CompleteMapping = true; break; @@ -200,13 +205,15 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { OperandsMapping[OpIdx] = ValMapping; } - if (isCopyLike && !CompleteMapping) + if (IsCopyLike && !CompleteMapping) // No way to deduce the type from what we have. - return InstructionMapping(); + return getInvalidInstructionMapping(); assert(CompleteMapping && "Setting an uncomplete mapping"); - Mapping.setOperandsMapping(getOperandsMapping(OperandsMapping)); - return Mapping; + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + /*OperandsMapping*/ getOperandsMapping(OperandsMapping), + NumOperandsForMapping); } /// Hashing function for PartialMapping. @@ -234,8 +241,8 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length, ++NumPartialMappingsCreated; - const PartialMapping *&PartMapping = MapOfPartialMappings[Hash]; - PartMapping = new PartialMapping{StartIdx, Length, RegBank}; + auto &PartMapping = MapOfPartialMappings[Hash]; + PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank); return *PartMapping; } @@ -268,8 +275,8 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown, ++NumValueMappingsCreated; - const ValueMapping *&ValMapping = MapOfValueMappings[Hash]; - ValMapping = new ValueMapping{BreakDown, NumBreakDowns}; + auto &ValMapping = MapOfValueMappings[Hash]; + ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns); return *ValMapping; } @@ -282,9 +289,9 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // The addresses of the value mapping are unique. // Therefore, we can use them directly to hash the operand mapping. hash_code Hash = hash_combine_range(Begin, End); - const auto &It = MapOfOperandsMappings.find(Hash); - if (It != MapOfOperandsMappings.end()) - return It->second; + auto &Res = MapOfOperandsMappings[Hash]; + if (Res) + return Res.get(); ++NumOperandsMappingsCreated; @@ -293,8 +300,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // mapping, because we use the pointer of the ValueMapping // to hash and we expect them to uniquely identify an instance // of value mapping. 
- ValueMapping *&Res = MapOfOperandsMappings[Hash]; - Res = new ValueMapping[std::distance(Begin, End)]; + Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End)); unsigned Idx = 0; for (Iterator It = Begin; It != End; ++It, ++Idx) { const ValueMapping *ValMap = *It; @@ -302,7 +308,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { continue; Res[Idx] = *ValMap; } - return Res; + return Res.get(); } const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( @@ -317,9 +323,44 @@ const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end()); } -RegisterBankInfo::InstructionMapping +static hash_code +hashInstructionMapping(unsigned ID, unsigned Cost, + const RegisterBankInfo::ValueMapping *OperandsMapping, + unsigned NumOperands) { + return hash_combine(ID, Cost, OperandsMapping, NumOperands); +} + +const RegisterBankInfo::InstructionMapping & +RegisterBankInfo::getInstructionMappingImpl( + bool IsInvalid, unsigned ID, unsigned Cost, + const RegisterBankInfo::ValueMapping *OperandsMapping, + unsigned NumOperands) const { + assert(((IsInvalid && ID == InvalidMappingID && Cost == 0 && + OperandsMapping == nullptr && NumOperands == 0) || + !IsInvalid) && + "Mismatch argument for invalid input"); + ++NumInstructionMappingsAccessed; + + hash_code Hash = + hashInstructionMapping(ID, Cost, OperandsMapping, NumOperands); + const auto &It = MapOfInstructionMappings.find(Hash); + if (It != MapOfInstructionMappings.end()) + return *It->second; + + ++NumInstructionMappingsCreated; + + auto &InstrMapping = MapOfInstructionMappings[Hash]; + if (IsInvalid) + InstrMapping = llvm::make_unique<InstructionMapping>(); + else + InstrMapping = llvm::make_unique<InstructionMapping>( + ID, Cost, OperandsMapping, NumOperands); + return *InstrMapping; +} + +const RegisterBankInfo::InstructionMapping & RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI); + const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); if (Mapping.isValid()) return Mapping; llvm_unreachable("The target must implement this"); @@ -329,14 +370,14 @@ RegisterBankInfo::InstructionMappings RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const { InstructionMappings PossibleMappings; // Put the default mapping first. - PossibleMappings.push_back(getInstrMapping(MI)); + PossibleMappings.push_back(&getInstrMapping(MI)); // Then the alternative mapping, if any. 
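The RegisterBankInfo hunks above replace manually deleted raw pointers with std::unique_ptr values stored in hash-keyed maps (MapOfPartialMappings, MapOfValueMappings, and the new MapOfInstructionMappings), so each distinct mapping is built once and every later request returns the cached object by reference. A standalone sketch of that uniquing idiom, with std::unordered_map and std::hash standing in for the LLVM containers and hash_combine used in the patch:

    #include <cstddef>
    #include <memory>
    #include <unordered_map>

    struct ValueMapping {
      unsigned StartIdx, Length;
      ValueMapping(unsigned S, unsigned L) : StartIdx(S), Length(L) {}
    };

    class MappingCache {
      // Owning storage; entries stay alive (with a stable address) for the
      // lifetime of the cache, so callers may hold references or pointers.
      mutable std::unordered_map<std::size_t, std::unique_ptr<ValueMapping>> Map;

      static std::size_t hashKey(unsigned S, unsigned L) {
        return std::hash<unsigned>()(S) * 31 + std::hash<unsigned>()(L);
      }

    public:
      const ValueMapping &getValueMapping(unsigned StartIdx, unsigned Length) const {
        auto &Slot = Map[hashKey(StartIdx, Length)];
        if (!Slot) // first request: create and cache
          Slot = std::make_unique<ValueMapping>(StartIdx, Length);
        return *Slot; // later requests: same object, no copy
      }
    };

Returning references into this cache is what lets getInstrMapping and getInstrPossibleMappings hand out const InstructionMapping pointers without copying, and it removes the need for the hand-written destructor the patch deletes.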
InstructionMappings AltMappings = getInstrAlternativeMappings(MI); - for (InstructionMapping &AltMapping : AltMappings) - PossibleMappings.emplace_back(std::move(AltMapping)); + for (const InstructionMapping *AltMapping : AltMappings) + PossibleMappings.push_back(AltMapping); #ifndef NDEBUG - for (const InstructionMapping &Mapping : PossibleMappings) - assert(Mapping.verify(MI) && "Mapping is invalid"); + for (const InstructionMapping *Mapping : PossibleMappings) + assert(Mapping->verify(MI) && "Mapping is invalid"); #endif return PossibleMappings; } @@ -349,6 +390,7 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const { void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); DEBUG(dbgs() << "Applying default-like mapping\n"); for (unsigned OpIdx = 0, EndIdx = OpdMapper.getInstrMapping().getNumOperands(); @@ -359,6 +401,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { DEBUG(dbgs() << " is not a register, nothing to be done\n"); continue; } + if (!MO.getReg()) { + DEBUG(dbgs() << " is %%noreg, nothing to be done\n"); + continue; + } + assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns != + 0 && + "Invalid mapping"); assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns == 1 && "This mapping is too complex for this function"); @@ -368,9 +417,25 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { DEBUG(dbgs() << " has not been repaired, nothing to be done\n"); continue; } - DEBUG(dbgs() << " changed, replace " << MO.getReg()); - MO.setReg(*NewRegs.begin()); - DEBUG(dbgs() << " with " << MO.getReg()); + unsigned OrigReg = MO.getReg(); + unsigned NewReg = *NewRegs.begin(); + DEBUG(dbgs() << " changed, replace " << PrintReg(OrigReg, nullptr)); + MO.setReg(NewReg); + DEBUG(dbgs() << " with " << PrintReg(NewReg, nullptr)); + + // The OperandsMapper creates plain scalar, we may have to fix that. + // Check if the types match and if not, fix that. + LLT OrigTy = MRI.getType(OrigReg); + LLT NewTy = MRI.getType(NewReg); + if (OrigTy != NewTy) { + assert(OrigTy.getSizeInBits() == NewTy.getSizeInBits() && + "Types with difference size cannot be handled by the default " + "mapping"); + DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to " + << OrigTy); + MRI.setType(NewReg, OrigTy); + } + DEBUG(dbgs() << '\n'); } } @@ -394,16 +459,18 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg, RC = MRI.getRegClass(Reg); } assert(RC && "Unable to deduce the register class"); - return RC->getSize() * 8; + return TRI.getRegSizeInBits(*RC); } //------------------------------------------------------------------------------ // Helper classes implementation. 
//------------------------------------------------------------------------------ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif bool RegisterBankInfo::PartialMapping::verify() const { assert(RegBank && "Register bank not set"); @@ -451,10 +518,12 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const { return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const { OS << "#BreakDown: " << NumBreakDowns << " "; @@ -472,8 +541,7 @@ bool RegisterBankInfo::InstructionMapping::verify( // Check that all the register operands are properly mapped. // Check the constructor invariant. // For PHI, we only care about mapping the definition. - assert(NumOperands == - ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) && + assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) && "NumOperands must match, see constructor"); assert(MI.getParent() && MI.getParent()->getParent() && "MI must be connected to a MachineFunction"); @@ -503,10 +571,12 @@ bool RegisterBankInfo::InstructionMapping::verify( return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const { OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: "; @@ -576,6 +646,11 @@ void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) { for (unsigned &NewVReg : NewVRegsForOpIdx) { assert(PartMap != ValMapping.end() && "Out-of-bound access"); assert(NewVReg == 0 && "Register has already been created"); + // The new registers are always bound to scalar with the right size. + // The actual type has to be set when the target does the mapping + // of the instruction. + // The rationale is that this generic code cannot guess how the + // target plans to split the input type. 
NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length)); MRI.setRegBank(NewVReg, *PartMap->RegBank); ++PartMap; @@ -619,10 +694,12 @@ RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx, return Res; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const { print(dbgs(), true); dbgs() << '\n'; } +#endif void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS, bool ForDebug) const { diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp index e500918..5ecaf5c 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -11,10 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -22,6 +26,23 @@ using namespace llvm; +unsigned llvm::constrainRegToClass(MachineRegisterInfo &MRI, + const TargetInstrInfo &TII, + const RegisterBankInfo &RBI, + MachineInstr &InsertPt, unsigned Reg, + const TargetRegisterClass &RegClass) { + if (!RBI.constrainGenericRegister(Reg, RegClass, MRI)) { + unsigned NewReg = MRI.createVirtualRegister(&RegClass); + BuildMI(*InsertPt.getParent(), InsertPt, InsertPt.getDebugLoc(), + TII.get(TargetOpcode::COPY), NewReg) + .addReg(Reg); + return NewReg; + } + + return Reg; +} + + unsigned llvm::constrainOperandRegClass( const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, @@ -32,14 +53,76 @@ unsigned llvm::constrainOperandRegClass( "PhysReg not implemented"); const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF); + return constrainRegToClass(MRI, TII, RBI, InsertPt, Reg, *RegClass); +} - if (!RBI.constrainGenericRegister(Reg, *RegClass, MRI)) { - unsigned NewReg = MRI.createVirtualRegister(RegClass); - BuildMI(*InsertPt.getParent(), InsertPt, InsertPt.getDebugLoc(), - TII.get(TargetOpcode::COPY), NewReg) - .addReg(Reg); - return NewReg; +bool llvm::isTriviallyDead(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + // If we can move an instruction, we can remove it. Otherwise, it has + // a side-effect of some sort. + bool SawStore = false; + if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore)) + return false; + + // Instructions without side-effects are dead iff they only define dead vregs. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI.use_nodbg_empty(Reg)) + return false; } + return true; +} - return Reg; +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, + MachineOptimizationRemarkEmitter &MORE, + MachineOptimizationRemarkMissed &R) { + MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. 
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) + R << (" (in function: " + MF.getName() + ")").str(); + + if (TPC.isGlobalISelAbortEnabled()) + report_fatal_error(R.getMsg()); + else + MORE.emit(R); +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, + MachineOptimizationRemarkEmitter &MORE, + const char *PassName, StringRef Msg, + const MachineInstr &MI) { + MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ", + MI.getDebugLoc(), MI.getParent()); + R << Msg << ": " << ore::MNV("Inst", MI); + reportGISelFailure(MF, TPC, MORE, R); +} + +Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (MI->getOpcode() != TargetOpcode::G_CONSTANT) + return None; + + if (MI->getOperand(1).isImm()) + return MI->getOperand(1).getImm(); + + if (MI->getOperand(1).isCImm() && + MI->getOperand(1).getCImm()->getBitWidth() <= 64) + return MI->getOperand(1).getCImm()->getSExtValue(); + + return None; +} + +const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (TargetOpcode::G_FCONSTANT != MI->getOpcode()) + return nullptr; + return MI->getOperand(1).getFPImm(); } diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp index 1ea5349..c6ca49c 100644 --- a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp @@ -192,10 +192,7 @@ namespace { } // end anonymous namespace char GlobalMerge::ID = 0; -INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables", - false, false) -INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables", - false, false) +INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false) bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { @@ -556,7 +553,12 @@ bool GlobalMerge::doInitialization(Module &M) { // Grab all non-const globals. for (auto &GV : M.globals()) { // Merge is safe for "normal" internal or external globals only - if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection()) + if (GV.isDeclaration() || GV.isThreadLocal() || + GV.hasSection() || GV.hasImplicitSection()) + continue; + + // It's not safe to merge globals that may be preempted + if (TM && !TM->shouldAssumeDSOLocal(M, &GV)) continue; if (!(MergeExternalGlobals && GV.hasExternalLinkage()) && diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp index b9f3d86..ff84053 100644 --- a/contrib/llvm/lib/CodeGen/IfConversion.cpp +++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "BranchFolding.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" @@ -25,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -39,7 +39,7 @@ using namespace llvm; -#define DEBUG_TYPE "ifcvt" +#define DEBUG_TYPE "if-converter" // Hidden options for help debugging. 
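reportGISelFailure, added above, centralizes the GlobalISel fallback policy: mark the MachineFunction as FailedISel, then either die with report_fatal_error when the abort option is on, or emit a missed-optimization remark and let the fallback path take over. A minimal standalone sketch of that decision shape (toy types; the real code goes through MachineOptimizationRemarkEmitter):

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    struct FunctionState {
      bool FailedISel = false;
    };

    // Record the failure first, so later passes can bail out cleanly, then
    // either abort or merely report depending on the configured policy.
    void reportSelectionFailure(FunctionState &F, bool AbortEnabled,
                                const std::string &Msg) {
      F.FailedISel = true;
      if (AbortEnabled) {
        std::fprintf(stderr, "fatal: %s\n", Msg.c_str());
        std::abort();
      }
      std::fprintf(stderr, "remark (missed optimization): %s\n", Msg.c_str());
    }

This helper is what RegBankSelect::runOnMachineFunction now calls in both of its failure paths, which is why the open-coded report_fatal_error / FailedISel pairs disappear from that file.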
static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); @@ -316,9 +316,9 @@ namespace { char &llvm::IfConverterID = IfConverter::ID; -INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction()) || (PredicateFtor && !PredicateFtor(MF))) @@ -588,19 +588,6 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, return TExit && TExit == FalseBBI.BB; } -/// Shrink the provided inclusive range by one instruction. -/// If the range was one instruction (\p It == \p Begin), It is not modified, -/// but \p Empty is set to true. -static inline void shrinkInclusiveRange( - MachineBasicBlock::iterator &Begin, - MachineBasicBlock::iterator &It, - bool &Empty) { - if (It == Begin) - Empty = true; - else - It--; -} - /// Count duplicated instructions and move the iterators to show where they /// are. /// @param TIB True Iterator Begin @@ -633,10 +620,8 @@ bool IfConverter::CountDuplicatedInstructions( while (TIB != TIE && FIB != FIE) { // Skip dbg_value instructions. These do not count. TIB = skipDebugInstructionsForward(TIB, TIE); - if(TIB == TIE) - break; FIB = skipDebugInstructionsForward(FIB, FIE); - if(FIB == FIE) + if (TIB == TIE || FIB == FIE) break; if (!TIB->isIdenticalTo(*FIB)) break; @@ -656,58 +641,42 @@ bool IfConverter::CountDuplicatedInstructions( if (TIB == TIE || FIB == FIE) return true; // Now, in preparation for counting duplicate instructions at the ends of the - // blocks, move the end iterators up past any branch instructions. - --TIE; - --FIE; - - // After this point TIB and TIE define an inclusive range, which means that - // TIB == TIE is true when there is one more instruction to consider, not at - // the end. Because we may not be able to go before TIB, we need a flag to - // indicate a completely empty range. - bool TEmpty = false, FEmpty = false; - - // Upon exit TIE and FIE will both point at the last non-shared instruction. - // They need to be moved forward to point past the last non-shared - // instruction if the range they delimit is non-empty. - auto IncrementEndIteratorsOnExit = make_scope_exit([&]() { - if (!TEmpty) - ++TIE; - if (!FEmpty) - ++FIE; - }); + // blocks, switch to reverse_iterators. Note that getReverse() returns an + // iterator that points to the same instruction, unlike std::reverse_iterator. + // We have to do our own shifting so that we get the same range. 
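A small convention threaded through this commit (GlobalMerge, IfConverter, ImplicitNullChecks, InterleavedAccess, and the new lazy BFI pass): the INITIALIZE_PASS macros now take DEBUG_TYPE instead of repeating the pass-argument string, so the command-line pass name and the debug type cannot drift apart. The shape, as the IfConverter hunk above uses it:

    #define DEBUG_TYPE "if-converter"

    INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false)
    INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
    INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false)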
+ MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse()); + MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse()); + const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse()); + const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse()); if (!TBB.succ_empty() || !FBB.succ_empty()) { if (SkipUnconditionalBranches) { - while (!TEmpty && TIE->isUnconditionalBranch()) - shrinkInclusiveRange(TIB, TIE, TEmpty); - while (!FEmpty && FIE->isUnconditionalBranch()) - shrinkInclusiveRange(FIB, FIE, FEmpty); + while (RTIE != RTIB && RTIE->isUnconditionalBranch()) + ++RTIE; + while (RFIE != RFIB && RFIE->isUnconditionalBranch()) + ++RFIE; } } - // If Dups1 includes all of a block, then don't count duplicate - // instructions at the end of the blocks. - if (TEmpty || FEmpty) - return true; - // Count duplicate instructions at the ends of the blocks. - while (!TEmpty && !FEmpty) { + while (RTIE != RTIB && RFIE != RFIB) { // Skip dbg_value instructions. These do not count. - TIE = skipDebugInstructionsBackward(TIE, TIB); - FIE = skipDebugInstructionsBackward(FIE, FIB); - TEmpty = TIE == TIB && TIE->isDebugValue(); - FEmpty = FIE == FIB && FIE->isDebugValue(); - if (TEmpty || FEmpty) + // Note that these are reverse iterators going forward. + RTIE = skipDebugInstructionsForward(RTIE, RTIB); + RFIE = skipDebugInstructionsForward(RFIE, RFIB); + if (RTIE == RTIB || RFIE == RFIB) break; - if (!TIE->isIdenticalTo(*FIE)) + if (!RTIE->isIdenticalTo(*RFIE)) break; // We have to verify that any branch instructions are the same, and then we // don't count them toward the # of duplicate instructions. - if (!TIE->isBranch()) + if (!RTIE->isBranch()) ++Dups2; - shrinkInclusiveRange(TIB, TIE, TEmpty); - shrinkInclusiveRange(FIB, FIE, FEmpty); + ++RTIE; + ++RFIE; } + TIE = std::next(RTIE.getReverse()); + FIE = std::next(RFIE.getReverse()); return true; } @@ -741,25 +710,21 @@ bool IfConverter::RescanInstructions( static void verifySameBranchInstructions( MachineBasicBlock *MBB1, MachineBasicBlock *MBB2) { - MachineBasicBlock::iterator B1 = MBB1->begin(); - MachineBasicBlock::iterator B2 = MBB2->begin(); - MachineBasicBlock::iterator E1 = std::prev(MBB1->end()); - MachineBasicBlock::iterator E2 = std::prev(MBB2->end()); - bool Empty1 = false, Empty2 = false; - while (!Empty1 && !Empty2) { - E1 = skipDebugInstructionsBackward(E1, B1); - E2 = skipDebugInstructionsBackward(E2, B2); - Empty1 = E1 == B1 && E1->isDebugValue(); - Empty2 = E2 == B2 && E2->isDebugValue(); - - if (Empty1 && Empty2) + const MachineBasicBlock::reverse_iterator B1 = MBB1->rend(); + const MachineBasicBlock::reverse_iterator B2 = MBB2->rend(); + MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin(); + MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin(); + while (E1 != B1 && E2 != B2) { + skipDebugInstructionsForward(E1, B1); + skipDebugInstructionsForward(E2, B2); + if (E1 == B1 && E2 == B2) break; - if (Empty1) { + if (E1 == B1) { assert(!E2->isBranch() && "Branch mis-match, one block is empty."); break; } - if (Empty2) { + if (E2 == B2) { assert(!E1->isBranch() && "Branch mis-match, one block is empty."); break; } @@ -769,8 +734,8 @@ static void verifySameBranchInstructions( "Branch mis-match, branch instructions don't match."); else break; - shrinkInclusiveRange(B1, E1, Empty1); - shrinkInclusiveRange(B2, E2, Empty2); + ++E1; + ++E2; } } #endif @@ -1353,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) { return false; PI = I++; } - 
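The rewrite above relies on getReverse() yielding a reverse iterator positioned on the same instruction, whereas a std::reverse_iterator built from a forward iterator is offset by one; that is why the converted range needs the std::next(...) shifts in both directions. A standalone illustration with a plain std::list, where std::make_reverse_iterator(std::next(I)) plays the role of I.getReverse():

    #include <cassert>
    #include <iterator>
    #include <list>

    int main() {
      std::list<int> L = {1, 2, 3, 4};
      auto B = std::next(L.begin()); // forward iterator at 2
      auto E = std::prev(L.end());   // forward iterator at 4; [B, E) is {2, 3}

      // Standard conversion: reverse_iterator(it) dereferences the element
      // *before* it, so the reverse view of [B, E) is
      // [make_reverse_iterator(E), make_reverse_iterator(B)).
      auto RBegin = std::make_reverse_iterator(E); // dereferences 3
      auto REnd   = std::make_reverse_iterator(B); // one past the reverse view
      assert(*RBegin == 3);
      assert(std::distance(RBegin, REnd) == 2);    // visits 3, then 2

      // A getReverse()-style conversion points at the *same* element, so the
      // equivalent of E.getReverse() here is make_reverse_iterator(next(E)),
      // and std::next of it lands back on RBegin -- the extra shift above.
      auto SameElem = std::make_reverse_iterator(std::next(E)); // dereferences 4
      assert(*SameElem == 4);
      assert(std::next(SameElem) == RBegin);
      return 0;
    }

Keeping that shift in one place, right where the ranges are converted, is what lets the duplicate-counting loop above walk backwards with ordinary ++ instead of the old shrinkInclusiveRange/empty-flag bookkeeping.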
return true; + // Finally see if the last I is indeed a successor to PI. + return PI->isSuccessor(&*I); } /// Invalidate predecessor BB info so it would be re-analyzed to determine if it @@ -1508,8 +1474,11 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { DontKill.addLiveIns(NextMBB); } + // Remove the branches from the entry so we can add the contents of the true + // block to it. + BBI.NonPredSize -= TII->removeBranch(*BBI.BB); + if (CvtMBB.pred_size() > 1) { - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond); @@ -1518,11 +1487,11 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { // explicitly remove CvtBBI as a successor. BBI.BB->removeSuccessor(&CvtMBB, true); } else { + // Predicate the instructions in the true block. RemoveKills(CvtMBB.begin(), CvtMBB.end(), DontKill, *TRI); PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); // Merge converted block into entry block. - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); MergeBlocks(BBI, *CvtBBI); } @@ -1622,8 +1591,11 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB); } + // Remove the branches from the entry so we can add the contents of the true + // block to it. + BBI.NonPredSize -= TII->removeBranch(*BBI.BB); + if (CvtMBB.pred_size() > 1) { - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); @@ -1637,7 +1609,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); // Now merge the entry of the triangle with the true block. - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); MergeBlocks(BBI, *CvtBBI, false); } @@ -2183,7 +2154,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // unknown probabilities into known ones. // FIXME: This usage is too tricky and in the future we would like to // eliminate all unknown probabilities in MBB. - ToBBI.BB->normalizeSuccProbs(); + if (ToBBI.IsBrAnalyzable) + ToBBI.BB->normalizeSuccProbs(); SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(), FromMBB.succ_end()); @@ -2263,7 +2235,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // Normalize the probabilities of ToBBI.BB's successors with all adjustment // we've done above. - ToBBI.BB->normalizeSuccProbs(); + if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable) + ToBBI.BB->normalizeSuccProbs(); ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end()); FromBBI.Predicate.clear(); diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 9588dfb..e308f49 100644 --- a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -22,6 +22,7 @@ // With the help of a runtime that understands the .fault_maps section, // faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs // a page fault. +// Store and LoadStore are also supported. 
// //===----------------------------------------------------------------------===// @@ -29,21 +30,22 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass { const TargetRegisterInfo *TRI = nullptr; AliasAnalysis *AA = nullptr; MachineModuleInfo *MMI = nullptr; + MachineFrameInfo *MFI = nullptr; bool analyzeBlockForNullChecks(MachineBasicBlock &MBB, SmallVectorImpl<NullCheck> &NullCheckList); - MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB, - MachineBasicBlock *HandlerMBB); + MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock *HandlerMBB); void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList); - /// Is \p MI a memory operation that can be used to implicitly null check the - /// value in \p PointerReg? \p PrevInsts is the set of instruction seen since + enum AliasResult { + AR_NoAlias, + AR_MayAlias, + AR_WillAliasEverything + }; + /// Returns AR_NoAlias if \p MI memory operation does not alias with + /// \p PrevMI, AR_MayAlias if they may alias and AR_WillAliasEverything if + /// they may alias and any further memory operation may alias with \p PrevMI. + AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI); + + enum SuitabilityResult { + SR_Suitable, + SR_Unsuitable, + SR_Impossible + }; + /// Return SR_Suitable if \p MI a memory operation that can be used to + /// implicitly null check the value in \p PointerReg, SR_Unsuitable if + /// \p MI cannot be used to null check and SR_Impossible if there is + /// no sense to continue lookup due to any other instruction will not be able + /// to be used. \p PrevInsts is the set of instruction seen since /// the explicit null check on \p PointerReg. - bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, - ArrayRef<MachineInstr *> PrevInsts); + SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, + ArrayRef<MachineInstr *> PrevInsts); /// Return true if \p FaultingMI can be hoisted from after the the /// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a /// non-null value if we also need to (and legally can) hoist a depedency. 
- bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg, - ArrayRef<MachineInstr *> InstsSeenSoFar, - MachineBasicBlock *NullSucc, MachineInstr *&Dependence); + bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg, + ArrayRef<MachineInstr *> InstsSeenSoFar, + MachineBasicBlock *NullSucc, MachineInstr *&Dependence); public: static char ID; @@ -193,7 +214,7 @@ public: } bool ImplicitNullChecks::canHandle(const MachineInstr *MI) { - if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects()) + if (MI->isCall() || MI->hasUnmodeledSideEffects()) return false; auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); }; (void)IsRegMask; @@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A, unsigned RegB = MOB.getReg(); - if (TRI->regsOverlap(RegA, RegB)) + if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef())) return false; } } @@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getRegInfo().getTargetRegisterInfo(); MMI = &MF.getMMI(); + MFI = &MF.getFrameInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); SmallVector<NullCheck, 16> NullCheckList; @@ -283,36 +305,76 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI, return false; } -bool ImplicitNullChecks::isSuitableMemoryOp( - MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) { +ImplicitNullChecks::AliasResult +ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI, + MachineInstr *PrevMI) { + // If it is not memory access, skip the check. + if (!(PrevMI->mayStore() || PrevMI->mayLoad())) + return AR_NoAlias; + // Load-Load may alias + if (!(MI.mayStore() || PrevMI->mayStore())) + return AR_NoAlias; + // We lost info, conservatively alias. If it was store then no sense to + // continue because we won't be able to check against it further. + if (MI.memoperands_empty()) + return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias; + if (PrevMI->memoperands_empty()) + return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias; + + for (MachineMemOperand *MMO1 : MI.memoperands()) { + // MMO1 should have a value due it comes from operation we'd like to use + // as implicit null check. + assert(MMO1->getValue() && "MMO1 should have a Value!"); + for (MachineMemOperand *MMO2 : PrevMI->memoperands()) { + if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) { + if (PSV->mayAlias(MFI)) + return AR_MayAlias; + continue; + } + llvm::AliasResult AAResult = AA->alias( + MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize, + MMO1->getAAInfo()), + MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize, + MMO2->getAAInfo())); + if (AAResult != NoAlias) + return AR_MayAlias; + } + } + return AR_NoAlias; +} + +ImplicitNullChecks::SuitabilityResult +ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, + ArrayRef<MachineInstr *> PrevInsts) { int64_t Offset; unsigned BaseReg; if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) || BaseReg != PointerReg) - return false; - - // We want the load to be issued at a sane offset from PointerReg, so that - // if PointerReg is null then the load reliably page faults. - if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize)) - return false; - - // Finally, we need to make sure that the load instruction actually is - // loading from PointerReg, and there isn't some re-definition of PointerReg - // between the compare and the load. 
- for (auto *PrevMI : PrevInsts) - for (auto &PrevMO : PrevMI->operands()) - if (PrevMO.isReg() && PrevMO.getReg() && - TRI->regsOverlap(PrevMO.getReg(), PointerReg)) - return false; - - return true; + return SR_Unsuitable; + + // We want the mem access to be issued at a sane offset from PointerReg, + // so that if PointerReg is null then the access reliably page faults. + if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() && + Offset < PageSize)) + return SR_Unsuitable; + + // Finally, check whether the current memory access aliases with previous one. + for (auto *PrevMI : PrevInsts) { + AliasResult AR = areMemoryOpsAliased(MI, PrevMI); + if (AR == AR_WillAliasEverything) + return SR_Impossible; + if (AR == AR_MayAlias) + return SR_Unsuitable; + } + return SR_Suitable; } -bool ImplicitNullChecks::canHoistLoadInst( - MachineInstr *FaultingMI, unsigned PointerReg, - ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc, - MachineInstr *&Dependence) { +bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI, + unsigned PointerReg, + ArrayRef<MachineInstr *> InstsSeenSoFar, + MachineBasicBlock *NullSucc, + MachineInstr *&Dependence) { auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar); if (!DepResult.CanReorder) return false; @@ -359,7 +421,8 @@ bool ImplicitNullChecks::canHoistLoadInst( // The Dependency can't be re-defining the base register -- then we won't // get the memory operation on the address we want. This is already // checked in \c IsSuitableMemoryOp. - assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) && + assert(!(DependenceMO.isDef() && + TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) && "Should have been checked before!"); } @@ -481,50 +544,76 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( return false; MachineInstr *Dependence; - if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) && - canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, - Dependence)) { + SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar); + if (SR == SR_Impossible) + return false; + if (SR == SR_Suitable && + canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) { NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc, NullSucc, Dependence); return true; } + // If MI re-defines the PointerReg then we cannot move further. + if (any_of(MI.operands(), [&](MachineOperand &MO) { + return MO.isReg() && MO.getReg() && MO.isDef() && + TRI->regsOverlap(MO.getReg(), PointerReg); + })) + return false; InstsSeenSoFar.push_back(&MI); } return false; } -/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine -/// instruction. The FAULTING_LOAD_OP instruction does the same load as LoadMI -/// (defining the same register), and branches to HandlerMBB if the load -/// faults. The FAULTING_LOAD_OP instruction is inserted at the end of MBB. -MachineInstr * -ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI, - MachineBasicBlock *MBB, - MachineBasicBlock *HandlerMBB) { +/// Wrap a machine instruction, MI, into a FAULTING machine instruction. +/// The FAULTING instruction does the same load/store as MI +/// (defining the same register), and branches to HandlerMBB if the mem access +/// faults. The FAULTING instruction is inserted at the end of MBB. +MachineInstr *ImplicitNullChecks::insertFaultingInstr( + MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) { const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for // all targets. 
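The SuitabilityResult and AliasResult enums introduced above turn the block scan into a three-way decision: a suitable access is hoisted and becomes the implicit check, an unsuitable one is merely skipped, and an impossible one (for instance a store whose aliasing can no longer be tracked) aborts the search for this null check entirely; redefining the tested pointer also ends the search. A standalone sketch of that control flow over a simplified instruction list (the field names are placeholders for what isSuitableMemoryOp and canHoistInst would answer):

    #include <vector>

    enum SuitabilityResult { SR_Suitable, SR_Unsuitable, SR_Impossible };

    struct Inst {
      SuitabilityResult Suitability; // isSuitableMemoryOp's verdict
      bool CanHoist;                 // canHoistInst's verdict
      bool RedefinesPointer;         // clobbers the null-checked register?
    };

    // Returns the index of the access to turn into an implicit null check,
    // or -1 if the block offers none.
    int findImplicitCheck(const std::vector<Inst> &Block) {
      for (int I = 0, E = (int)Block.size(); I != E; ++I) {
        const Inst &MI = Block[I];
        if (MI.Suitability == SR_Impossible)
          return -1; // nothing later can be proven safe either
        if (MI.Suitability == SR_Suitable && MI.CanHoist)
          return I;  // this is the faulting access
        if (MI.RedefinesPointer)
          return -1; // the checked value no longer exists past this point
      }
      return -1;
    }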
DebugLoc DL; - unsigned NumDefs = LoadMI->getDesc().getNumDefs(); + unsigned NumDefs = MI->getDesc().getNumDefs(); assert(NumDefs <= 1 && "other cases unhandled!"); unsigned DefReg = NoRegister; if (NumDefs != 0) { - DefReg = LoadMI->defs().begin()->getReg(); - assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && + DefReg = MI->defs().begin()->getReg(); + assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 && "expected exactly one def!"); } - auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg) - .addMBB(HandlerMBB) - .addImm(LoadMI->getOpcode()); + FaultMaps::FaultKind FK; + if (MI->mayLoad()) + FK = + MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad; + else + FK = FaultMaps::FaultingStore; - for (auto &MO : LoadMI->uses()) - MIB.addOperand(MO); + auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg) + .addImm(FK) + .addMBB(HandlerMBB) + .addImm(MI->getOpcode()); + + for (auto &MO : MI->uses()) { + if (MO.isReg()) { + MachineOperand NewMO = MO; + if (MO.isUse()) { + NewMO.setIsKill(false); + } else { + assert(MO.isDef() && "Expected def or use"); + NewMO.setIsDead(false); + } + MIB.add(NewMO); + } else { + MIB.add(MO); + } + } - MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end()); + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); return MIB; } @@ -545,18 +634,18 @@ void ImplicitNullChecks::rewriteNullChecks( NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI); } - // Insert a faulting load where the conditional branch was originally. We - // check earlier ensures that this bit of code motion is legal. We do not - // touch the successors list for any basic block since we haven't changed - // control flow, we've just made it implicit. - MachineInstr *FaultingLoad = insertFaultingLoad( + // Insert a faulting instruction where the conditional branch was + // originally. We check earlier ensures that this bit of code motion + // is legal. We do not touch the successors list for any basic block + // since we haven't changed control flow, we've just made it implicit. + MachineInstr *FaultingInstr = insertFaultingInstr( NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc()); // Now the values defined by MemOperation, if any, are live-in of // the block of MemOperation. - // The original load operation may define implicit-defs alongside - // the loaded value. + // The original operation may define implicit-defs alongside + // the value. 
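One detail of insertFaultingInstr above: when the memory access is re-emitted as a FAULTING_OP at the end of the check block, the kill and dead flags recorded on its register operands may no longer be accurate at the new position, so the copy clears them and leaves liveness to be recomputed later. A standalone sketch of that flag-scrubbing step (toy operand type, not MachineOperand):

    #include <vector>

    struct Operand {
      bool IsReg;
      bool IsDef;  // definition (otherwise a use)
      bool IsKill; // "last use here" -- only trustworthy at the original position
      bool IsDead; // "value never read" -- likewise position-dependent
    };

    // Clone operands for an instruction that is being re-emitted elsewhere,
    // dropping liveness flags that were only valid at the old position.
    std::vector<Operand> cloneOperandsForMovedInst(const std::vector<Operand> &Ops) {
      std::vector<Operand> Out;
      for (Operand Op : Ops) {
        if (Op.IsReg) {
          if (Op.IsDef)
            Op.IsDead = false;
          else
            Op.IsKill = false;
        }
        Out.push_back(Op);
      }
      return Out;
    }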
MachineBasicBlock *MBB = NC.getMemOperation()->getParent(); - for (const MachineOperand &MO : FaultingLoad->operands()) { + for (const MachineOperand &MO : FaultingInstr->operands()) { if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); @@ -588,8 +677,8 @@ void ImplicitNullChecks::rewriteNullChecks( char ImplicitNullChecks::ID = 0; char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID; -INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_BEGIN(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_END(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp index 3d81184..eda4f74 100644 --- a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp @@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI); // We take the DebugLoc from MI, since OrigMI may be attributed to a - // different source location. + // different source location. auto *NewMI = LIS.getInstructionFromIndex(DefIdx); NewMI->setDebugLoc(MI.getDebugLoc()); @@ -643,8 +643,11 @@ void InlineSpiller::reMaterializeAll() { Edit->eraseVirtReg(Reg); continue; } - assert((LIS.hasInterval(Reg) && !LIS.getInterval(Reg).empty()) && - "Reg with empty interval has reference"); + + assert(LIS.hasInterval(Reg) && + (!LIS.getInterval(Reg).empty() || !MRI.reg_nodbg_empty(Reg)) && + "Empty and not used live-range?!"); + RegsToSpill[ResultPos++] = Reg; } RegsToSpill.erase(RegsToSpill.begin() + ResultPos, RegsToSpill.end()); @@ -686,7 +689,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) { return true; } -#if !defined(NDEBUG) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD // Dump the range of instructions from B to E with their slot indexes. static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E, @@ -856,21 +860,46 @@ void InlineSpiller::insertReload(unsigned NewVReg, ++NumReloads; } +/// Check if \p Def fully defines a VReg with an undefined value. +/// If that's the case, that means the value of VReg is actually +/// not relevant. +static bool isFullUndefDef(const MachineInstr &Def) { + if (!Def.isImplicitDef()) + return false; + assert(Def.getNumOperands() == 1 && + "Implicit def with more than one definition"); + // We can say that the VReg defined by Def is undef, only if it is + // fully defined by Def. Otherwise, some of the lanes may not be + // undef and the value of the VReg matters. + return !Def.getOperand(0).getSubReg(); +} + /// insertSpill - Insert a spill of NewVReg after MI. void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill, MachineBasicBlock::iterator MI) { MachineBasicBlock &MBB = *MI->getParent(); MachineInstrSpan MIS(MI); - TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot, - MRI.getRegClass(NewVReg), &TRI); + bool IsRealSpill = true; + if (isFullUndefDef(*MI)) { + // Don't spill undef value. + // Anything works for undef, in particular keeping the memory + // uninitialized is a viable option and it saves code size and + // run time. 
+ BuildMI(MBB, std::next(MI), MI->getDebugLoc(), TII.get(TargetOpcode::KILL)) + .addReg(NewVReg, getKillRegState(isKill)); + IsRealSpill = false; + } else + TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot, + MRI.getRegClass(NewVReg), &TRI); LIS.InsertMachineInstrRangeInMaps(std::next(MI), MIS.end()); DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS, "spill")); ++NumSpills; - HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original); + if (IsRealSpill) + HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original); } /// spillAroundUses - insert spill code around each use of Reg. @@ -887,20 +916,10 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Debug values are not allowed to affect codegen. if (MI->isDebugValue()) { // Modify DBG_VALUE now that the value is in a spill slot. - bool IsIndirect = MI->isIndirectDebugValue(); - uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0; - const MDNode *Var = MI->getDebugVariable(); - const MDNode *Expr = MI->getDebugExpression(); - DebugLoc DL = MI->getDebugLoc(); - DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); MachineBasicBlock *MBB = MI->getParent(); - assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) && - "Expected inlined-at fields to agree"); - BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(StackSlot) - .addImm(Offset) - .addMetadata(Var) - .addMetadata(Expr); + DEBUG(dbgs() << "Modifying debug info due to spill:\t" << *MI); + buildDbgValueForSpill(*MBB, MI, *MI, StackSlot); + MBB->erase(MI); continue; } diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp index ec35b3f..ee4929c 100644 --- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -45,6 +45,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" @@ -68,8 +69,7 @@ class InterleavedAccess : public FunctionPass { public: static char ID; - InterleavedAccess(const TargetMachine *TM = nullptr) - : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) { + InterleavedAccess() : FunctionPass(ID), DT(nullptr), TLI(nullptr) { initializeInterleavedAccessPass(*PassRegistry::getPassRegistry()); } @@ -84,7 +84,6 @@ public: private: DominatorTree *DT; - const TargetMachine *TM; const TargetLowering *TLI; /// The maximum supported interleave factor. @@ -108,18 +107,16 @@ private: } // end anonymous namespace. 
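The InterleavedAccess changes here, together with the hunks that follow, drop the TargetMachine constructor parameter; the pass now asks TargetPassConfig for the machine when it runs and bails out quietly when no codegen pipeline is present. A sketch of that retrieval pattern as the patch uses it (assumes an LLVM tree and the headers the patch adds; not a standalone program):

    #include "llvm/CodeGen/TargetPassConfig.h"
    #include "llvm/Target/TargetMachine.h"

    bool InterleavedAccess::runOnFunction(Function &F) {
      // TargetPassConfig is only registered when codegen set up the pipeline.
      auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
      if (!TPC)
        return false;

      auto &TM = TPC->getTM<TargetMachine>();
      TLI = TM.getSubtargetImpl(F)->getTargetLowering();
      // ... rest of the pass unchanged ...
      return false;
    }

The same pattern is what allows createInterleavedAccessPass() to lose its TargetMachine argument without breaking users that schedule the pass outside a codegen pipeline.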
char InterleavedAccess::ID = 0; -INITIALIZE_TM_PASS_BEGIN( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_TM_PASS_END( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) -FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) { - return new InterleavedAccess(TM); +FunctionPass *llvm::createInterleavedAccessPass() { + return new InterleavedAccess(); } /// \brief Check if the mask is a DE-interleave mask of the given factor @@ -426,13 +423,15 @@ bool InterleavedAccess::lowerInterleavedStore( } bool InterleavedAccess::runOnFunction(Function &F) { - if (!TM || !LowerInterleavedAccesses) + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC || !LowerInterleavedAccesses) return false; DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + auto &TM = TPC->getTM<TargetMachine>(); + TLI = TM.getSubtargetImpl(F)->getTargetLowering(); MaxFactor = TLI->getMaxSupportedInterleaveFactor(); // Holds dead instructions that will be erased later. diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp index afd2406..c6cc909 100644 --- a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp +++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp @@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) { Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - DL.getIntPtrType(Context), nullptr); + DL.getIntPtrType(Context)); break; case Intrinsic::memmove: M.getOrInsertFunction("memmove", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - DL.getIntPtrType(Context), nullptr); + DL.getIntPtrType(Context)); break; case Intrinsic::memset: M.getOrInsertFunction("memset", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt32Ty(M.getContext()), - DL.getIntPtrType(Context), nullptr); + DL.getIntPtrType(Context)); break; case Intrinsic::sqrt: EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl"); diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp index 26794e2..f2defb4 100644 --- a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp +++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Target/TargetMachine.h" #include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/BasicTTIImpl.h" @@ -31,21 +30,11 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; -// Enable or disable FastISel. Both options are needed, because -// FastISel is enabled by default with -fast, and we wish to be -// able to enable or disable fast-isel independently from -O0. 
-static cl::opt<cl::boolOrDefault> -EnableFastISelOption("fast-isel", cl::Hidden, - cl::desc("Enable the \"fast\" instruction selector")); - -static cl::opt<bool> - EnableGlobalISel("global-isel", cl::Hidden, cl::init(false), - cl::desc("Enable the \"global\" instruction selector")); - void LLVMTargetMachine::initAsmInfo() { MRI = TheTarget.createMCRegInfo(getTargetTriple().str()); MII = TheTarget.createMCInstrInfo(); @@ -71,8 +60,7 @@ void LLVMTargetMachine::initAsmInfo() { TmpAsmInfo->setPreserveAsmComments(Options.MCOptions.PreserveAsmComments); - if (Options.CompressDebugSections) - TmpAsmInfo->setCompressDebugSections(DebugCompressionType::DCT_ZlibGnu); + TmpAsmInfo->setCompressDebugSections(Options.CompressDebugSections); TmpAsmInfo->setRelaxELFRelocations(Options.RelaxELFRelocations); @@ -85,7 +73,7 @@ void LLVMTargetMachine::initAsmInfo() { LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef DataLayoutString, const Triple &TT, StringRef CPU, - StringRef FS, TargetOptions Options, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) { @@ -106,109 +94,31 @@ static MCContext * addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter, AnalysisID StopBefore, - AnalysisID StopAfter, - MachineFunctionInitializer *MFInitializer = nullptr) { - - // When in emulated TLS mode, add the LowerEmuTLS pass. - if (TM->Options.EmulatedTLS) - PM.add(createLowerEmuTLSPass(TM)); - - PM.add(createPreISelIntrinsicLoweringPass()); - - // Add internal analysis passes from the target machine. - PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); - + AnalysisID StopAfter) { // Targets may override createPassConfig to provide a target-specific // subclass. TargetPassConfig *PassConfig = TM->createPassConfig(PM); PassConfig->setStartStopPasses(StartBefore, StartAfter, StopBefore, StopAfter); - // Set PassConfig options provided by TargetMachine. PassConfig->setDisableVerify(DisableVerify); - PM.add(PassConfig); - - PassConfig->addIRPasses(); - - PassConfig->addCodeGenPrepare(); - - PassConfig->addPassesToHandleExceptions(); - - PassConfig->addISelPrepare(); - MachineModuleInfo *MMI = new MachineModuleInfo(TM); - MMI->setMachineFunctionInitializer(MFInitializer); PM.add(MMI); - // Enable FastISel with -fast, but allow that to be overridden. - TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); - if (EnableFastISelOption == cl::BOU_TRUE || - (TM->getOptLevel() == CodeGenOpt::None && - TM->getO0WantsFastISel())) - TM->setFastISel(true); - - // Ask the target for an isel. - if (LLVM_UNLIKELY(EnableGlobalISel)) { - if (PassConfig->addIRTranslator()) - return nullptr; - - PassConfig->addPreLegalizeMachineIR(); - - if (PassConfig->addLegalizeMachineIR()) - return nullptr; - - // Before running the register bank selector, ask the target if it - // wants to run some passes. - PassConfig->addPreRegBankSelect(); - - if (PassConfig->addRegBankSelect()) - return nullptr; - - PassConfig->addPreGlobalInstructionSelect(); - - if (PassConfig->addGlobalInstructionSelect()) - return nullptr; - - // Pass to reset the MachineFunction if the ISel failed. - PM.add(createResetMachineFunctionPass( - PassConfig->reportDiagnosticWhenGlobalISelFallback())); - - // Provide a fallback path when we do not want to abort on - // not-yet-supported input. 
- if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) && - PassConfig->addInstSelector()) - return nullptr; - - } else if (PassConfig->addInstSelector()) + if (PassConfig->addISelPasses()) return nullptr; - PassConfig->addMachinePasses(); - PassConfig->setInitialized(); return &MMI->getContext(); } -bool LLVMTargetMachine::addPassesToEmitFile( - PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, - bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter, - AnalysisID StopBefore, AnalysisID StopAfter, - MachineFunctionInitializer *MFInitializer) { - // Add common CodeGen passes. - MCContext *Context = - addPassesToGenerateCode(this, PM, DisableVerify, StartBefore, StartAfter, - StopBefore, StopAfter, MFInitializer); - if (!Context) - return true; - - if (StopBefore || StopAfter) { - PM.add(createPrintMIRPass(Out)); - return false; - } - +bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM, + raw_pwrite_stream &Out, CodeGenFileType FileType, + MCContext &Context) { if (Options.MCOptions.MCSaveTempLabels) - Context->setAllowTemporaryLabels(false); + Context.setAllowTemporaryLabels(false); const MCSubtargetInfo &STI = *getMCSubtargetInfo(); const MCAsmInfo &MAI = *getMCAsmInfo(); @@ -225,14 +135,14 @@ bool LLVMTargetMachine::addPassesToEmitFile( // Create a code emitter if asked to show the encoding. MCCodeEmitter *MCE = nullptr; if (Options.MCOptions.ShowMCEncoding) - MCE = getTarget().createMCCodeEmitter(MII, MRI, *Context); + MCE = getTarget().createMCCodeEmitter(MII, MRI, Context); MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU, Options.MCOptions); auto FOut = llvm::make_unique<formatted_raw_ostream>(Out); MCStreamer *S = getTarget().createAsmStreamer( - *Context, std::move(FOut), Options.MCOptions.AsmVerbose, + Context, std::move(FOut), Options.MCOptions.AsmVerbose, Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB, Options.MCOptions.ShowMCInst); AsmStreamer.reset(S); @@ -241,7 +151,7 @@ bool LLVMTargetMachine::addPassesToEmitFile( case CGFT_ObjectFile: { // Create the code emitter for the target if it exists. If not, .o file // emission fails. - MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, *Context); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, Context); MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU, Options.MCOptions); @@ -249,11 +159,11 @@ bool LLVMTargetMachine::addPassesToEmitFile( return true; // Don't waste memory on names of temp labels. - Context->setUseNamesOnTempLabels(false); + Context.setUseNamesOnTempLabels(false); Triple T(getTargetTriple().str()); AsmStreamer.reset(getTarget().createMCObjectStreamer( - T, *Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll, + T, Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll, Options.MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ true)); break; @@ -261,7 +171,7 @@ bool LLVMTargetMachine::addPassesToEmitFile( case CGFT_Null: // The Null output is intended for use for performance analysis and testing, // not real users. 
- AsmStreamer.reset(getTarget().createNullStreamer(*Context)); + AsmStreamer.reset(getTarget().createNullStreamer(Context)); break; } @@ -272,8 +182,28 @@ bool LLVMTargetMachine::addPassesToEmitFile( return true; PM.add(Printer); - PM.add(createFreeMachineFunctionPass()); + return false; +} +bool LLVMTargetMachine::addPassesToEmitFile( + PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, + bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter, + AnalysisID StopBefore, AnalysisID StopAfter) { + // Add common CodeGen passes. + MCContext *Context = + addPassesToGenerateCode(this, PM, DisableVerify, StartBefore, StartAfter, + StopBefore, StopAfter); + if (!Context) + return true; + + if (StopBefore || StopAfter) { + PM.add(createPrintMIRPass(Out)); + } else { + if (addAsmPrinter(PM, Out, FileType, *Context)) + return true; + } + + PM.add(createFreeMachineFunctionPass()); return false; } diff --git a/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp new file mode 100644 index 0000000..996d40c --- /dev/null +++ b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -0,0 +1,97 @@ +///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===// +/// +/// The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// This is an alternative analysis pass to MachineBlockFrequencyInfo. The +/// difference is that with this pass the block frequencies are not computed +/// when the analysis pass is executed but rather when the BFI result is +/// explicitly requested by the analysis client. 
+/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "lazy-machine-block-freq" + +INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, + "Lazy Machine Block Frequency Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, + "Lazy Machine Block Frequency Analysis", true, true) + +char LazyMachineBlockFrequencyInfoPass::ID = 0; + +LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass() + : MachineFunctionPass(ID) { + initializeLazyMachineBlockFrequencyInfoPassPass( + *PassRegistry::getPassRegistry()); +} + +void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS, + const Module *M) const { + getBFI().print(OS, M); +} + +void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +void LazyMachineBlockFrequencyInfoPass::releaseMemory() { + OwnedMBFI.reset(); + OwnedMLI.reset(); + OwnedMDT.reset(); +} + +MachineBlockFrequencyInfo & +LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { + auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>(); + if (MBFI) { + DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n"); + return *MBFI; + } + + auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>(); + auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>(); + auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>(); + DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n"); + DEBUG(if (MLI) dbgs() << "LoopInfo is available\n"); + + if (!MLI) { + DEBUG(dbgs() << "Building LoopInfo on the fly\n"); + // First create a dominator tree. + DEBUG(if (MDT) dbgs() << "DominatorTree is available\n"); + + if (!MDT) { + DEBUG(dbgs() << "Building DominatorTree on the fly\n"); + OwnedMDT = make_unique<MachineDominatorTree>(); + OwnedMDT->getBase().recalculate(*MF); + MDT = OwnedMDT.get(); + } + + // Generate LoopInfo from it. 
+ OwnedMLI = make_unique<MachineLoopInfo>(); + OwnedMLI->getBase().analyze(MDT->getBase()); + MLI = OwnedMLI.get(); + } + + OwnedMBFI = make_unique<MachineBlockFrequencyInfo>(); + OwnedMBFI->calculate(*MF, MBPI, *MLI); + return *OwnedMBFI.get(); +} + +bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction( + MachineFunction &F) { + MF = &F; + return false; +} diff --git a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp index 834ed5f..995c58a 100644 --- a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp +++ b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp @@ -15,13 +15,22 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <string> +#include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "lexicalscopes" @@ -38,6 +47,10 @@ void LexicalScopes::reset() { /// initialize - Scan machine function and constuct lexical scope nest. void LexicalScopes::initialize(const MachineFunction &Fn) { + // Don't attempt any lexical scope creation for a NoDebug compile unit. + if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() == + DICompileUnit::NoDebug) + return; reset(); MF = &Fn; SmallVector<InsnRange, 4> MIRanges; @@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) { void LexicalScopes::extractLexicalScopes( SmallVectorImpl<InsnRange> &MIRanges, DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { - // Scan each instruction and create scopes. First build working set of scopes. for (const auto &MBB : *MF) { const MachineInstr *RangeBeginMI = nullptr; @@ -74,8 +86,9 @@ void LexicalScopes::extractLexicalScopes( continue; } - // Ignore DBG_VALUE. It does not contribute to any instruction in output. - if (MInsn.isDebugValue()) + // Ignore DBG_VALUE and similar instruction that do not contribute to any + // instruction in the output. + if (MInsn.isMetaInstruction()) continue; if (RangeBeginMI) { @@ -127,6 +140,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) { LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope, const DILocation *IA) { if (IA) { + // Skip scopes inlined from a NoDebug compile unit. + if (Scope->getSubprogram()->getUnit()->getEmissionKind() == + DICompileUnit::NoDebug) + return getOrCreateLexicalScope(IA); // Create an abstract scope for inlined function. getOrCreateAbstractScope(Scope); // Create an inlined scope for inlined function. 
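As an illustration only (not part of this patch): the LexicalScopes hunks above bail out when the enclosing compile unit was emitted as NoDebug, both for the function itself and for scopes inlined from such a unit. Reduced to a standalone helper, the test looks roughly like the sketch below; the helper name is made up, and the null check on the subprogram is an extra safety assumption rather than something the patch performs.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"

// Illustrative sketch: return true when no lexical scopes should be built
// for MF because its compile unit carries no debug info (NoDebug emission).
// The null check on the subprogram is an added assumption, not in the patch.
static bool isNoDebugFunction(const llvm::MachineFunction &MF) {
  const llvm::DISubprogram *SP = MF.getFunction()->getSubprogram();
  return !SP ||
         SP->getUnit()->getEmissionKind() == llvm::DICompileUnit::NoDebug;
}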
@@ -181,10 +198,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope, else Parent = getOrCreateLexicalScope(InlinedAt); - I = InlinedLexicalScopeMap.emplace(std::piecewise_construct, - std::forward_as_tuple(P), - std::forward_as_tuple(Parent, Scope, - InlinedAt, false)) + I = InlinedLexicalScopeMap + .emplace(std::piecewise_construct, std::forward_as_tuple(P), + std::forward_as_tuple(Parent, Scope, InlinedAt, false)) .first; return &I->second; } @@ -241,7 +257,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) { void LexicalScopes::assignInstructionRanges( SmallVectorImpl<InsnRange> &MIRanges, DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { - LexicalScope *PrevLexicalScope = nullptr; for (const auto &R : MIRanges) { LexicalScope *S = MI2ScopeMap.lookup(R.first); @@ -299,9 +314,8 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) { return Result; } -/// dump - Print data structures. -void LexicalScope::dump(unsigned Indent) const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const { raw_ostream &err = dbgs(); err.indent(Indent); err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n"; @@ -316,5 +330,5 @@ void LexicalScope::dump(unsigned Indent) const { for (unsigned i = 0, e = Children.size(); i != e; ++i) if (Children[i] != this) Children[i]->dump(Indent + 2); -#endif } +#endif diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp index c945376..b5e705f 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -24,13 +24,16 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/UniqueVector.h" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -40,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "live-debug-values" +#define DEBUG_TYPE "livedebugvalues" STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); @@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass { private: const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; + const TargetFrameLowering *TFI; LexicalScopes LS; /// Keeps track of lexical scopes associated with a user value's source @@ -127,11 +131,13 @@ private: if (int RegNo = isDbgValueDescribedByReg(MI)) { Kind = RegisterKind; Loc.RegisterLoc.RegNo = RegNo; - uint64_t Offset = + int64_t Offset = MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0; // We don't support offsets larger than 4GiB here. They are // slated to be replaced with DIExpressions anyway. - if (Offset >= (1ULL << 32)) + // With indirect debug values used for spill locations, Offset + // can be negative. + if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32)) Kind = InvalidKind; else Loc.RegisterLoc.Offset = Offset; @@ -150,7 +156,9 @@ private: /// dominates MBB. 
bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); } - void dump() const { MI.dump(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { MI.dump(); } +#endif bool operator==(const VarLoc &Other) const { return Var == Other.Var && Loc.Hash == Other.Loc.Hash; @@ -167,6 +175,11 @@ private: typedef UniqueVector<VarLoc> VarLocMap; typedef SparseBitVector<> VarLocSet; typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB; + struct SpillDebugPair { + MachineInstr *SpillInst; + MachineInstr *DebugInst; + }; + typedef SmallVector<SpillDebugPair, 4> SpillMap; /// This holds the working set of currently open ranges. For fast /// access, this is done both as a set of VarLocIDs, and a map of @@ -216,14 +229,21 @@ private: } }; + bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF, + unsigned &Reg); + int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg); + void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs); + void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, SpillMap &Spills); void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges, const VarLocMap &VarLocIDs); bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs); bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, VarLocMap &VarLocIDs); + VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills, + bool transferSpills); bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs, const VarLocMap &VarLocIDs, @@ -263,7 +283,7 @@ public: char LiveDebugValues::ID = 0; char &llvm::LiveDebugValuesID = LiveDebugValues::ID; -INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis", +INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false, false) /// Default construct and initialize the pass. @@ -282,6 +302,7 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const { // Debug Range Extension Implementation //===----------------------------------------------------------------------===// +#ifndef NDEBUG void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V, const VarLocMap &VarLocIDs, @@ -300,6 +321,22 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, } Out << "\n"; } +#endif + +/// Given a spill instruction, extract the register and offset used to +/// address the spill location in a target independent way. +int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI, + unsigned &Reg) { + assert(MI.hasOneMemOperand() && + "Spill instruction does not have exactly one memory operand?"); + auto MMOI = MI.memoperands_begin(); + const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue(); + assert(PVal->kind() == PseudoSourceValue::FixedStack && + "Inconsistent memory operand in spill instruction"); + int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex(); + const MachineBasicBlock *MBB = MI.getParent(); + return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg); +} /// End all previous ranges related to @MI and start a new range from @MI /// if it is a DBG_VALUE instr. 
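A brief aside on the offset handling in the VarLoc constructor above (an illustrative sketch, not code taken from the patch): now that indirect debug values can describe spill locations, the DBG_VALUE offset may be negative, so the magnitude check is made symmetric and INT64_MIN is rejected up front, since std::abs on it would overflow. Isolated into a helper with a hypothetical name:

#include <cstdint>
#include <cstdlib>

// Accept only offsets whose magnitude fits the 32-bit encoding assumed by
// VarLoc; reject INT64_MIN before calling std::abs, which is undefined there.
static bool isEncodableDebugOffset(int64_t Offset) {
  return Offset != INT64_MIN && std::abs(Offset) < (1LL << 32);
}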
@@ -336,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI, unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); SparseBitVector<> KillSet; for (const MachineOperand &MO : MI.operands()) { + // Determine whether the operand is a register def. Assume that call + // instructions never clobber SP, because some backends (e.g., AArch64) + // never list SP in the regmask. if (MO.isReg() && MO.isDef() && MO.getReg() && - TRI->isPhysicalRegister(MO.getReg())) { + TRI->isPhysicalRegister(MO.getReg()) && + !(MI.isCall() && MO.getReg() == SP)) { // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) for (unsigned ID : OpenRanges.getVarLocs()) @@ -358,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI, OpenRanges.erase(KillSet, VarLocIDs); } +/// Decide if @MI is a spill instruction and return true if it is. We use 2 +/// criteria to make this decision: +/// - Is this instruction a store to a spill slot? +/// - Is there a register operand that is both used and killed? +/// TODO: Store optimization can fold spills into other stores (including +/// other spills). We do not handle this yet (more than one memory operand). +bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI, + MachineFunction *MF, unsigned &Reg) { + const MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + int FI; + const MachineMemOperand *MMO; + + // TODO: Handle multiple stores folded into one. + if (!MI.hasOneMemOperand()) + return false; + + // To identify a spill instruction, use the same criteria as in AsmPrinter. + if (!((TII->isStoreToStackSlotPostFE(MI, FI) || + TII->hasStoreToStackSlot(MI, MMO, FI)) && + FrameInfo.isSpillSlotObjectIndex(FI))) + return false; + + // In a spill instruction generated by the InlineSpiller the spilled register + // has its kill flag set. Return false if we don't find such a register. + Reg = 0; + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isUse() && MO.isKill()) { + Reg = MO.getReg(); + break; + } + } + return Reg != 0; +} + +/// A spilled register may indicate that we have to end the current range of +/// a variable and create a new one for the spill location. +/// We don't want to insert any instructions in transfer(), so we just create +/// the DBG_VALUE witout inserting it and keep track of it in @Spills. +/// It will be inserted into the BB when we're done iterating over the +/// instructions. +void LiveDebugValues::transferSpillInst(MachineInstr &MI, + OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, + SpillMap &Spills) { + unsigned Reg; + MachineFunction *MF = MI.getParent()->getParent(); + if (!isSpillInstruction(MI, MF, Reg)) + return; + + // Check if the register is the location of a debug value. + for (unsigned ID : OpenRanges.getVarLocs()) { + if (VarLocIDs[ID].isDescribedByReg() == Reg) { + DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '(' + << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); + + // Create a DBG_VALUE instruction to describe the Var in its spilled + // location, but don't insert it yet to avoid invalidating the + // iterator in our caller. 
+ unsigned SpillBase; + int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase); + const MachineInstr *DMI = &VarLocIDs[ID].MI; + MachineInstr *SpDMI = + BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0, + DMI->getDebugVariable(), DMI->getDebugExpression()); + SpDMI->getOperand(1).setImm(SpillOffset); + DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: "; + SpDMI->print(dbgs(), false, TII)); + + // The newly created DBG_VALUE instruction SpDMI must be inserted after + // MI. Keep track of the pairing. + SpillDebugPair MIP = {&MI, SpDMI}; + Spills.push_back(MIP); + + // End all previous ranges of Var. + OpenRanges.erase(VarLocIDs[ID].Var); + + // Add the VarLoc to OpenRanges. + VarLoc VL(*SpDMI, LS); + unsigned SpillLocID = VarLocIDs.insert(VL); + OpenRanges.insert(SpillLocID, VL.Var); + return; + } + } +} + /// Terminate all open ranges at the end of the current basic block. bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, @@ -383,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, /// This routine creates OpenRanges and OutLocs. bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) { + VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, + SpillMap &Spills, bool transferSpills) { bool Changed = false; transferDebugValue(MI, OpenRanges, VarLocIDs); transferRegisterDef(MI, OpenRanges, VarLocIDs); + if (transferSpills) + transferSpillInst(MI, OpenRanges, VarLocIDs, Spills); Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs); return Changed; } @@ -475,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { bool OLChanged = false; bool MBBJoined = false; - VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. + VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. OpenRangesSet OpenRanges; // Ranges that are open until end of bb. - VarLocInMBB OutLocs; // Ranges that exist beyond bb. - VarLocInMBB InLocs; // Ranges that are incoming after joining. + VarLocInMBB OutLocs; // Ranges that exist beyond bb. + VarLocInMBB InLocs; // Ranges that are incoming after joining. + SpillMap Spills; // DBG_VALUEs associated with spills. DenseMap<unsigned int, MachineBasicBlock *> OrderToBB; DenseMap<MachineBasicBlock *, unsigned int> BBToOrder; @@ -490,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { Pending; // Initialize every mbb with OutLocs. + // We are not looking at any spill instructions during the initial pass + // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE + // instructions for spills of registers that are known to be user variables + // within the BB in which the spill occurs. for (auto &MBB : MF) for (auto &MI : MBB) - transfer(MI, OpenRanges, OutLocs, VarLocIDs); + transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, + /*transferSpills=*/false); DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization", dbgs())); @@ -524,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { if (MBBJoined) { MBBJoined = false; Changed = true; + // Now that we have started to extend ranges across BBs we need to + // examine spill instructions to see whether they spill registers that + // correspond to user variables. 
for (auto &MI : *MBB) - OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs); + OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, + /*transferSpills=*/true); + + // Add any DBG_VALUE instructions necessitated by spills. + for (auto &SP : Spills) + MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst), + SP.DebugInst); + Spills.clear(); DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after propagating", dbgs())); @@ -559,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); TII = MF.getSubtarget().getInstrInfo(); + TFI = MF.getSubtarget().getFrameLowering(); LS.initialize(MF); bool Changed = ExtendRanges(MF); diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp index 0934d8c..0c76478 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "livedebug" +#define DEBUG_TYPE "livedebugvars" static cl::opt<bool> EnableLDV("live-debug-variables", cl::init(true), @@ -54,11 +54,11 @@ EnableLDV("live-debug-variables", cl::init(true), STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted"); char LiveDebugVariables::ID = 0; -INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { @@ -944,7 +944,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, IsIndirect, Loc.getReg(), offset, Variable, Expression); else BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(Loc) + .add(Loc) .addImm(offset) .addMetadata(Variable) .addMetadata(Expression); @@ -1005,8 +1005,8 @@ bool LiveDebugVariables::doInitialization(Module &M) { return Pass::doInitialization(M); } -#ifndef NDEBUG -LLVM_DUMP_METHOD void LiveDebugVariables::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LiveDebugVariables::dump() const { if (pImpl) static_cast<LDVImpl*>(pImpl)->print(dbgs()); } diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h index afe87a5..1d7e3d4 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h @@ -59,7 +59,7 @@ public: void emitDebugValues(VirtRegMap *VRM); /// dump - Print data structures to dbgs(). 
- void dump(); + void dump() const; private: diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp index 623af49..9ef9f23 100644 --- a/contrib/llvm/lib/CodeGen/LiveInterval.cpp +++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp @@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() { SubRanges = nullptr; } +void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator, + LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) { + + LaneBitmask ToApply = LaneMask; + for (SubRange &SR : subranges()) { + LaneBitmask SRMask = SR.LaneMask; + LaneBitmask Matching = SRMask & LaneMask; + if (Matching.none()) + continue; + + SubRange *MatchingRange; + if (SRMask == Matching) { + // The subrange fits (it does not cover bits outside \p LaneMask). + MatchingRange = &SR; + } else { + // We have to split the subrange into a matching and non-matching part. + // Reduce lanemask of existing lane to non-matching part. + SR.LaneMask = SRMask & ~Matching; + // Create a new subrange for the matching part + MatchingRange = createSubRangeFrom(Allocator, Matching, SR); + } + Apply(*MatchingRange); + ToApply &= ~Matching; + } + // Create a new subrange if there are uncovered bits left. + if (ToApply.any()) { + SubRange *NewRange = createSubRange(Allocator, ToApply); + Apply(*NewRange); + } +} + unsigned LiveInterval::getSize() const { unsigned Sum = 0; for (const Segment &S : segments) @@ -1032,6 +1063,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { // When they exist, Spills.back().start <= LastStart, // and WriteI[-1].start <= LastStart. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LiveRangeUpdater::print(raw_ostream &OS) const { if (!isDirty()) { if (LR) @@ -1058,6 +1090,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const { LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const { print(errs()); } +#endif // Determine if A and B should be coalesced. static inline bool coalescable(const LiveRange::Segment &A, diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp index 70d3483..471dcea 100644 --- a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===// +//===- LiveIntervalAnalysis.cpp - Live Interval Analysis ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,35 +7,52 @@ // //===----------------------------------------------------------------------===// // -// This file implements the LiveInterval analysis pass which is used -// by the Linear Scan Register allocator. This pass linearizes the -// basic blocks of the function in DFS order and computes live intervals for -// each virtual and physical register. +/// \file This file implements the LiveInterval analysis pass which is used +/// by the Linear Scan Register allocator. This pass linearizes the +/// basic blocks of the function in DFS order and computes live intervals for +/// each virtual and physical register. 
// //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "LiveRangeCalc.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> -#include <cmath> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "regalloc" @@ -59,11 +76,13 @@ static bool EnablePrecomputePhysRegs = false; #endif // NDEBUG namespace llvm { + cl::opt<bool> UseSegmentSetForPhysRegs( "use-segment-set-for-physregs", cl::Hidden, cl::init(true), cl::desc( "Use segment set for the computation of the live ranges of physregs.")); -} + +} // end namespace llvm void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -78,8 +97,7 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), - DomTree(nullptr), LRCalc(nullptr) { +LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) { initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); } @@ -96,16 +114,14 @@ void LiveIntervals::releaseMemory() { RegMaskBits.clear(); RegMaskBlocks.clear(); - for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) - delete RegUnitRanges[i]; + for (LiveRange *LR : RegUnitRanges) + delete LR; RegUnitRanges.clear(); // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. VNInfoAllocator.Reset(); } -/// runOnMachineFunction - calculates LiveIntervals -/// bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { MF = &fn; MRI = &MF->getRegInfo(); @@ -135,14 +151,13 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { return true; } -/// print - Implement the dump method. void LiveIntervals::print(raw_ostream &OS, const Module* ) const { OS << "********** INTERVALS **********\n"; // Dump the regunits. 
- for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) - if (LiveRange *LR = RegUnitRanges[i]) - OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n'; + for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit) + if (LiveRange *LR = RegUnitRanges[Unit]) + OS << PrintRegUnit(Unit, TRI) << ' ' << *LR << '\n'; // Dump the virtregs. for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { @@ -152,8 +167,8 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const { } OS << "RegMasks:"; - for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i) - OS << ' ' << RegMaskSlots[i]; + for (SlotIndex Idx : RegMaskSlots) + OS << ' ' << Idx; OS << '\n'; printInstrs(OS); @@ -165,20 +180,17 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void LiveIntervals::dumpInstrs() const { +LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { printInstrs(dbgs()); } #endif LiveInterval* LiveIntervals::createInterval(unsigned reg) { - float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? - llvm::huge_valf : 0.0F; + float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F; return new LiveInterval(reg, Weight); } - -/// computeVirtRegInterval - Compute the live interval of a virtual register, -/// based on defs and uses. +/// Compute the live interval of a virtual register, based on defs and uses. void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); assert(LI.empty() && "Should only compute empty intervals."); @@ -200,7 +212,7 @@ void LiveIntervals::computeRegMasks() { RegMaskBlocks.resize(MF->getNumBlockIDs()); // Find all instructions with regmask operands. - for (MachineBasicBlock &MBB : *MF) { + for (const MachineBasicBlock &MBB : *MF) { std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()]; RMB.first = RegMaskSlots.size(); @@ -210,7 +222,7 @@ void LiveIntervals::computeRegMasks() { RegMaskBits.push_back(Mask); } - for (MachineInstr &MI : MBB) { + for (const MachineInstr &MI : MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isRegMask()) continue; @@ -245,9 +257,9 @@ void LiveIntervals::computeRegMasks() { // interference. // -/// computeRegUnitInterval - Compute the live range of a register unit, based -/// on the uses and defs of aliasing registers. The range should be empty, -/// or contain only dead phi-defs from ABI blocks. +/// Compute the live range of a register unit, based on the uses and defs of +/// aliasing registers. The range should be empty, or contain only dead +/// phi-defs from ABI blocks. void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { assert(LRCalc && "LRCalc not initialized."); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -257,22 +269,30 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { // may share super-registers. That's OK because createDeadDefs() is // idempotent. It is very rare for a register unit to have multiple roots, so // uniquing super-registers is probably not worthwhile. 
- for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { - for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); - Supers.isValid(); ++Supers) { - if (!MRI->reg_empty(*Supers)) - LRCalc->createDeadDefs(LR, *Supers); + bool IsReserved = true; + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { + for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); + Super.isValid(); ++Super) { + unsigned Reg = *Super; + if (!MRI->reg_empty(Reg)) + LRCalc->createDeadDefs(LR, Reg); + // A register unit is considered reserved if all its roots and all their + // super registers are reserved. + if (!MRI->isReserved(Reg)) + IsReserved = false; } } // Now extend LR to reach all uses. // Ignore uses of reserved registers. We only track defs of those. - for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { - for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); - Supers.isValid(); ++Supers) { - unsigned Reg = *Supers; - if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg)) - LRCalc->extendToUses(LR, Reg); + if (!IsReserved) { + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { + for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); + Super.isValid(); ++Super) { + unsigned Reg = *Super; + if (!MRI->reg_empty(Reg)) + LRCalc->extendToUses(LR, Reg); + } } } @@ -281,11 +301,9 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { LR.flushSegmentSet(); } - -/// computeLiveInRegUnits - Precompute the live ranges of any register units -/// that are live-in to an ABI block somewhere. Register values can appear -/// without a corresponding def when entering the entry block or a landing pad. -/// +/// Precompute the live ranges of any register units that are live-in to an ABI +/// block somewhere. Register values can appear without a corresponding def when +/// entering the entry block or a landing pad. void LiveIntervals::computeLiveInRegUnits() { RegUnitRanges.resize(TRI->getNumRegUnits()); DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); @@ -294,18 +312,15 @@ void LiveIntervals::computeLiveInRegUnits() { SmallVector<unsigned, 8> NewRanges; // Check all basic blocks for live-ins. - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - const MachineBasicBlock *MBB = &*MFI; - + for (const MachineBasicBlock &MBB : *MF) { // We only care about ABI blocks: Entry + landing pads. - if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty()) + if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty()) continue; // Create phi-defs at Begin for all live-in registers. - SlotIndex Begin = Indexes->getMBBStartIdx(MBB); - DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); - for (const auto &LI : MBB->liveins()) { + SlotIndex Begin = Indexes->getMBBStartIdx(&MBB); + DEBUG(dbgs() << Begin << "\tBB#" << MBB.getNumber()); + for (const auto &LI : MBB.liveins()) { for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange *LR = RegUnitRanges[Unit]; @@ -324,16 +339,13 @@ void LiveIntervals::computeLiveInRegUnits() { DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. 
- for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) { - unsigned Unit = NewRanges[i]; + for (unsigned Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); - } } - static void createSegmentsForValues(LiveRange &LR, - iterator_range<LiveInterval::vni_iterator> VNIs) { - for (auto VNI : VNIs) { + iterator_range<LiveInterval::vni_iterator> VNIs) { + for (VNInfo *VNI : VNIs) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; @@ -341,7 +353,7 @@ static void createSegmentsForValues(LiveRange &LR, } } -typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList; +using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo*>, 16>; static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, ShrinkToUsesWorkList &WorkList, @@ -349,7 +361,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, // Keep track of the PHIs that are in use. SmallPtrSet<VNInfo*, 8> UsedPHIs; // Blocks that have already been added to WorkList as live-out. - SmallPtrSet<MachineBasicBlock*, 16> LiveOut; + SmallPtrSet<const MachineBasicBlock*, 16> LiveOut; // Extend intervals to reach all uses in WorkList. while (!WorkList.empty()) { @@ -368,7 +380,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, !UsedPHIs.insert(VNI).second) continue; // The PHI is live, make sure the predecessors are live-out. - for (auto &Pred : MBB->predecessors()) { + for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -384,7 +396,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI)); // Make sure VNI is live-out from the predecessors. - for (auto &Pred : MBB->predecessors()) { + for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -415,22 +427,20 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, ShrinkToUsesWorkList WorkList; // Visit all instructions reading li->reg. - for (MachineRegisterInfo::reg_instr_iterator - I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end(); - I != E; ) { - MachineInstr *UseMI = &*(I++); - if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg)) + unsigned Reg = li->reg; + for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) { + if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg)) continue; - SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot(); + SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot(); LiveQueryResult LRQ = li->Query(Idx); VNInfo *VNI = LRQ.valueIn(); if (!VNI) { // This shouldn't happen: readsVirtualRegister returns true, but there is // no live value. It is likely caused by a target getting <undef> flags // wrong. 
- DEBUG(dbgs() << Idx << '\t' << *UseMI + DEBUG(dbgs() << Idx << '\t' << UseMI << "Warning: Instr claims to read non-existent value in " - << *li << '\n'); + << *li << '\n'); continue; } // Special case: An early-clobber tied operand reads and writes the @@ -458,7 +468,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl<MachineInstr*> *dead) { bool MayHaveSplitComponents = false; - for (auto VNI : LI.valnos) { + for (VNInfo *VNI : LI.valnos) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; @@ -548,7 +558,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) { SR.segments.swap(NewLR.segments); // Remove dead PHI value numbers - for (auto VNI : SR.valnos) { + for (VNInfo *VNI : SR.valnos) { if (VNI->isUnused()) continue; const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def); @@ -571,8 +581,8 @@ void LiveIntervals::extendToIndices(LiveRange &LR, ArrayRef<SlotIndex> Undefs) { assert(LRCalc && "LRCalc not initialized."); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); - for (unsigned i = 0, e = Indices.size(); i != e; ++i) - LRCalc->extend(LR, Indices[i], /*PhysReg=*/0, Undefs); + for (SlotIndex Idx : Indices) + LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs); } void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, @@ -599,13 +609,11 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, // Find all blocks that are reachable from KillMBB without leaving VNI's live // range. It is possible that KillMBB itself is reachable, so start a DFS // from each successor. - typedef df_iterator_default_set<MachineBasicBlock*,9> VisitedTy; + using VisitedTy = df_iterator_default_set<MachineBasicBlock*,9>; VisitedTy Visited; - for (MachineBasicBlock::succ_iterator - SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end(); - SuccI != SuccE; ++SuccI) { + for (MachineBasicBlock *Succ : KillMBB->successors()) { for (df_ext_iterator<MachineBasicBlock*, VisitedTy> - I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited); + I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited); I != E;) { MachineBasicBlock *MBB = *I; @@ -657,9 +665,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. RU.clear(); - for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid(); - ++Units) { - const LiveRange &RURange = getRegUnit(*Units); + for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); + ++Unit) { + const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) continue; RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); @@ -802,9 +810,8 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const { // Conservatively return true instead of scanning huge predecessor lists. 
if (PHIMBB->pred_size() > 100) return true; - for (MachineBasicBlock::const_pred_iterator - PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI) - if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI))) + for (const MachineBasicBlock *Pred : PHIMBB->predecessors()) + if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred))) return true; } return false; @@ -831,7 +838,6 @@ LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) { return S; } - //===----------------------------------------------------------------------===// // Register mask functions //===----------------------------------------------------------------------===// @@ -864,7 +870,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, return false; bool Found = false; - for (;;) { + while (true) { assert(*SlotI >= LiveI->start); // Loop over all slots overlapping this segment. while (*SlotI < LiveI->end) { @@ -895,7 +901,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, // IntervalUpdate class. //===----------------------------------------------------------------------===// -// HMEditor is a toolkit used by handleMove to trim or extend live intervals. +/// Toolkit used by handleMove to trim or extend live intervals. class LiveIntervals::HMEditor { private: LiveIntervals& LIS; @@ -1241,10 +1247,12 @@ private: LiveRange::iterator NewIdxIn = NewIdxOut; assert(NewIdxIn == LR.find(NewIdx.getBaseIndex())); const SlotIndex SplitPos = NewIdxDef; + OldIdxVNI = OldIdxIn->valno; // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut. + OldIdxOut->valno->def = OldIdxIn->start; *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end, - OldIdxIn->valno); + OldIdxOut->valno); // OldIdxIn and OldIdxVNI are now undef and can be overridden. // We Slide [NewIdxIn, OldIdxIn) down one position. // |- X0/NewIdxIn -| ... 
|- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -| @@ -1514,8 +1522,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, } } - for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) { - unsigned Reg = OrigRegs[i]; + for (unsigned Reg : OrigRegs) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; @@ -1524,16 +1531,16 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, if (!LI.hasAtLeastOneValue()) continue; - for (LiveInterval::SubRange &S : LI.subranges()) { + for (LiveInterval::SubRange &S : LI.subranges()) repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask); - } + repairOldRegInRange(Begin, End, endIdx, LI, Reg); } } void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { - if (LiveRange *LR = getCachedRegUnit(*Units)) + for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { + if (LiveRange *LR = getCachedRegUnit(*Unit)) if (VNInfo *VNI = LR->getVNInfoAt(Pos)) LR->removeValNo(VNI); } diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp index fc2f233..b3248e5 100644 --- a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp +++ b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===// +//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===// // // The LLVM Compiler Infrastructure // @@ -16,16 +16,16 @@ #include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SparseBitVector.h" -#include "llvm/Support/Debug.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" -#include <algorithm> +#include <cassert> +#include <cstdlib> using namespace llvm; #define DEBUG_TYPE "regalloc" - // Merge a LiveInterval's segments. Guarantee no overlaps. void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) { if (Range.empty()) @@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) { LiveRange::const_iterator RegEnd = Range.end(); SegmentIter SegPos = Segments.find(RegPos->start); - for (;;) { + while (true) { assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval"); SegPos.erase(); if (!SegPos.valid()) @@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { CheckedFirstInterference = true; // Quickly skip interference check for empty sets. - if (VirtReg->empty() || LiveUnion->empty()) { + if (LR->empty() || LiveUnion->empty()) { SeenAllInterferences = true; return 0; } - // In most cases, the union will start before VirtReg. - VirtRegI = VirtReg->begin(); + // In most cases, the union will start before LR. + LRI = LR->begin(); LiveUnionI.setMap(LiveUnion->getMap()); - LiveUnionI.find(VirtRegI->start); + LiveUnionI.find(LRI->start); } - LiveInterval::iterator VirtRegEnd = VirtReg->end(); + LiveRange::const_iterator LREnd = LR->end(); LiveInterval *RecentReg = nullptr; while (LiveUnionI.valid()) { - assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg"); + assert(LRI != LREnd && "Reached end of LR"); // Check for overlapping interference. - while (VirtRegI->start < LiveUnionI.stop() && - VirtRegI->end > LiveUnionI.start()) { + while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) { // This is an overlap, record the interfering register. 
LiveInterval *VReg = LiveUnionI.value(); if (VReg != RecentReg && !isSeenInterference(VReg)) { @@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { } // The iterators are now not overlapping, LiveUnionI has been advanced - // beyond VirtRegI. - assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap"); + // beyond LRI. + assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap"); // Advance the iterator that ends first. - VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start()); - if (VirtRegI == VirtRegEnd) + LRI = LR->advanceTo(LRI, LiveUnionI.start()); + if (LRI == LREnd) break; // Detect overlap, handle above. - if (VirtRegI->start < LiveUnionI.stop()) + if (LRI->start < LiveUnionI.stop()) continue; // Still not overlapping. Catch up LiveUnionI. - LiveUnionI.advanceTo(VirtRegI->start); + LiveUnionI.advanceTo(LRI->start); } SeenAllInterferences = true; return InterferingVRegs.size(); diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp index dcc41c1..cde6ccd 100644 --- a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -53,7 +53,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) { continue; removeReg(Reg); } else if (O->isRegMask()) - removeRegsInMask(*O, nullptr); + removeRegsInMask(*O); } // Add uses to the set. @@ -120,12 +120,11 @@ void LivePhysRegs::print(raw_ostream &OS) const { OS << "\n"; } -/// Dumps the currently live registers to the debug output. -LLVM_DUMP_METHOD void LivePhysRegs::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LivePhysRegs::dump() const { dbgs() << " " << *this; -#endif } +#endif bool LivePhysRegs::available(const MachineRegisterInfo &MRI, unsigned Reg) const { @@ -143,63 +142,84 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI, /// Add live-in registers of basic block \p MBB to \p LiveRegs. void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) { for (const auto &LI : MBB.liveins()) { - MCSubRegIndexIterator S(LI.PhysReg, TRI); - if (LI.LaneMask.all() || (LI.LaneMask.any() && !S.isValid())) { - addReg(LI.PhysReg); + unsigned Reg = LI.PhysReg; + LaneBitmask Mask = LI.LaneMask; + MCSubRegIndexIterator S(Reg, TRI); + assert(Mask.any() && "Invalid livein mask"); + if (Mask.all() || !S.isValid()) { + addReg(Reg); continue; } for (; S.isValid(); ++S) { unsigned SI = S.getSubRegIndex(); - if ((LI.LaneMask & TRI->getSubRegIndexLaneMask(SI)).any()) + if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any()) addReg(S.getSubReg()); } } } -/// Add pristine registers to the given \p LiveRegs. This function removes -/// actually saved callee save registers when \p InPrologueEpilogue is false. -static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, - const MachineFrameInfo &MFI, - const TargetRegisterInfo &TRI) { - for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) +/// Adds all callee saved registers to \p LiveRegs. +static void addCalleeSavedRegs(LivePhysRegs &LiveRegs, + const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) LiveRegs.addReg(*CSR); +} + +/// Adds pristine registers to the given \p LiveRegs. Pristine registers are +/// callee saved registers that are unused in the function. 
+static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; + /// Add all callee saved regs, then remove the ones that are saved+restored. + addCalleeSavedRegs(LiveRegs, MF); + /// Remove the ones that are not saved/restored; they are pristine. for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) LiveRegs.removeReg(Info.getReg()); } void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addBlockLiveIns(*Succ); + if (!MBB.succ_empty()) { + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*Succ); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers that are saved and + // restored (somewhere); This does not include callee saved registers that + // are unused and hence not saved and restored; they are called pristine. + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + addReg(Info.getReg()); + } + } } void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) { - if (MBB.isReturnBlock()) { - // The return block has no successors whose live-ins we could merge - // below. So instead we add the callee saved registers manually. - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) - addReg(*I); - } else { - addPristines(*this, MF, MFI, *TRI); - } + if (!MBB.succ_empty()) { + addPristines(*this, MF); + addLiveOutsNoPristines(MBB); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) + addCalleeSavedRegs(*this, MF); } - - addLiveOutsNoPristines(MBB); } void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) - addPristines(*this, MF, MFI, *TRI); + addPristines(*this, MF); addBlockLiveIns(MBB); } -void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, +void llvm::computeLiveIns(LivePhysRegs &LiveRegs, + const MachineRegisterInfo &MRI, MachineBasicBlock &MBB) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); assert(MBB.livein_empty()); LiveRegs.init(TRI); LiveRegs.addLiveOutsNoPristines(MBB); @@ -207,10 +227,12 @@ void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, LiveRegs.stepBackward(MI); for (unsigned Reg : LiveRegs) { + if (MRI.isReserved(Reg)) + continue; // Skip the register if we are about to add one of its super registers. 
bool ContainsSuperReg = false; for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) { - if (LiveRegs.contains(*SReg)) { + if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) { ContainsSuperReg = true; break; } diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp index 0128376..8c43c9f 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -20,11 +20,14 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" +// Reserve an address that indicates a value that is known to be "undef". +static VNInfo UndefVNI(0xbad, SlotIndex()); + void LiveRangeCalc::resetLiveOutMap() { unsigned NumBlocks = MF->getNumBlockIDs(); Seen.clear(); Seen.resize(NumBlocks); - EntryInfoMap.clear(); + EntryInfos.clear(); Map.resize(NumBlocks); } @@ -75,34 +78,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) { LI.createSubRangeFrom(*Alloc, ClassMask, LI); } - LaneBitmask Mask = SubMask; - for (LiveInterval::SubRange &S : LI.subranges()) { - // A Mask for subregs common to the existing subrange and current def. - LaneBitmask Common = S.LaneMask & Mask; - if (Common.none()) - continue; - LiveInterval::SubRange *CommonRange; - // A Mask for subregs covered by the subrange but not the current def. - LaneBitmask RM = S.LaneMask & ~Mask; - if (RM.any()) { - // Split the subrange S into two parts: one covered by the current - // def (CommonRange), and the one not affected by it (updated S). - S.LaneMask = RM; - CommonRange = LI.createSubRangeFrom(*Alloc, Common, S); - } else { - assert(Common == S.LaneMask); - CommonRange = &S; - } + LI.refineSubRanges(*Alloc, SubMask, + [&MO, this](LiveInterval::SubRange &SR) { if (MO.isDef()) - createDeadDef(*Indexes, *Alloc, *CommonRange, MO); - Mask &= ~Common; - } - // Create a new SubRange for subregs we did not cover yet. - if (Mask.any()) { - LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask); - if (MO.isDef()) - createDeadDef(*Indexes, *Alloc, *NewRange, MO); - } + createDeadDef(*Indexes, *Alloc, SR, MO); + }); } // Create the def in the main liverange. We do not have to do this if @@ -289,8 +269,7 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, if (UndefOnEntry[BN]) return false; - auto MarkDefined = - [this,BN,&DefOnEntry,&UndefOnEntry] (MachineBasicBlock &B) -> bool { + auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool { for (MachineBasicBlock *S : B.successors()) DefOnEntry[S->getNumber()] = true; DefOnEntry[BN] = true; @@ -307,11 +286,19 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, // Determine if the exit from the block is reached by some def. unsigned N = WorkList[i]; MachineBasicBlock &B = *MF->getBlockNumbered(N); - if (Seen[N] && Map[&B].first != nullptr) - return MarkDefined(B); + if (Seen[N]) { + const LiveOutPair &LOB = Map[&B]; + if (LOB.first != nullptr && LOB.first != &UndefVNI) + return MarkDefined(B); + } SlotIndex Begin, End; std::tie(Begin, End) = Indexes->getMBBRange(&B); - LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), End); + // Treat End as not belonging to B. + // If LR has a segment S that starts at the next block, i.e. [End, ...), + // std::upper_bound will return the segment following S. Instead, + // S should be treated as the first segment that does not overlap B. 
+ LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), + End.getPrevSlot()); if (UB != LR.begin()) { LiveRange::Segment &Seg = *std::prev(UB); if (Seg.end > Begin) { @@ -384,10 +371,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, #endif FoundUndef |= MBB->pred_empty(); - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *Pred = *PI; - + for (MachineBasicBlock *Pred : MBB->predecessors()) { // Is this a known live-out block? if (Seen.test(Pred->getNumber())) { if (VNInfo *VNI = Map[Pred].first) { @@ -406,7 +390,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, auto EP = LR.extendInBlock(Undefs, Start, End); VNInfo *VNI = EP.first; FoundUndef |= EP.second; - setLiveOutValue(Pred, VNI); + setLiveOutValue(Pred, EP.second ? &UndefVNI : VNI); if (VNI) { if (TheVNI && TheVNI != VNI) UniqueVNI = false; @@ -425,7 +409,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, } LiveIn.clear(); - FoundUndef |= (TheVNI == nullptr); + FoundUndef |= (TheVNI == nullptr || TheVNI == &UndefVNI); if (Undefs.size() > 0 && FoundUndef) UniqueVNI = false; @@ -436,7 +420,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, // If a unique reaching def was found, blit in the live ranges immediately. if (UniqueVNI) { - assert(TheVNI != nullptr); + assert(TheVNI != nullptr && TheVNI != &UndefVNI); LiveRangeUpdater Updater(&LR); for (unsigned BN : WorkList) { SlotIndex Start, End; @@ -452,22 +436,26 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, } // Prepare the defined/undefined bit vectors. - auto EF = EntryInfoMap.find(&LR); - if (EF == EntryInfoMap.end()) { + EntryInfoMap::iterator Entry; + bool DidInsert; + std::tie(Entry, DidInsert) = EntryInfos.insert( + std::make_pair(&LR, std::make_pair(BitVector(), BitVector()))); + if (DidInsert) { + // Initialize newly inserted entries. unsigned N = MF->getNumBlockIDs(); - EF = EntryInfoMap.insert({&LR, {BitVector(), BitVector()}}).first; - EF->second.first.resize(N); - EF->second.second.resize(N); + Entry->second.first.resize(N); + Entry->second.second.resize(N); } - BitVector &DefOnEntry = EF->second.first; - BitVector &UndefOnEntry = EF->second.second; + BitVector &DefOnEntry = Entry->second.first; + BitVector &UndefOnEntry = Entry->second.second; // Multiple values were found, so transfer the work list to the LiveIn array // where UpdateSSA will use it as a work list. LiveIn.reserve(WorkList.size()); for (unsigned BN : WorkList) { MachineBasicBlock *MBB = MF->getBlockNumbered(BN); - if (Undefs.size() > 0 && !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry)) + if (Undefs.size() > 0 && + !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry)) continue; addLiveInBlock(LR, DomTree->getNode(MBB)); if (MBB == &UseMBB) @@ -485,9 +473,9 @@ void LiveRangeCalc::updateSSA() { assert(DomTree && "Missing dominator tree"); // Interate until convergence. - unsigned Changes; + bool Changed; do { - Changes = 0; + Changed = false; // Propagate live-out values down the dominator tree, inserting phi-defs // when necessary. for (LiveInBlock &I : LiveIn) { @@ -510,15 +498,20 @@ void LiveRangeCalc::updateSSA() { IDomValue = Map[IDom->getBlock()]; // Cache the DomTree node that defined the value. 
- if (IDomValue.first && !IDomValue.second) + if (IDomValue.first && IDomValue.first != &UndefVNI && + !IDomValue.second) { Map[IDom->getBlock()].second = IDomValue.second = DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def)); + } - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - LiveOutPair &Value = Map[*PI]; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + LiveOutPair &Value = Map[Pred]; if (!Value.first || Value.first == IDomValue.first) continue; + if (Value.first == &UndefVNI) { + needPHI = true; + break; + } // Cache the DomTree node that defined the value. if (!Value.second) @@ -542,7 +535,7 @@ void LiveRangeCalc::updateSSA() { // Create a phi-def if required. if (needPHI) { - ++Changes; + Changed = true; assert(Alloc && "Need VNInfo allocator to create PHI-defs"); SlotIndex Start, End; std::tie(Start, End) = Indexes->getMBBRange(MBB); @@ -561,7 +554,7 @@ void LiveRangeCalc::updateSSA() { LR.addSegment(LiveInterval::Segment(Start, End, VNI)); LOP = LiveOutPair(VNI, Node); } - } else if (IDomValue.first) { + } else if (IDomValue.first && IDomValue.first != &UndefVNI) { // No phi-def here. Remember incoming value. I.Value = IDomValue.first; @@ -573,9 +566,9 @@ void LiveRangeCalc::updateSSA() { // MBB is live-out and doesn't define its own value. if (LOP.first == IDomValue.first) continue; - ++Changes; + Changed = true; LOP = IDomValue; } } - } while (Changes); + } while (Changed); } diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h index 1a7598f..d41b782 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h @@ -24,6 +24,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/CodeGen/LiveInterval.h" @@ -65,7 +66,8 @@ class LiveRangeCalc { /// registers do not overlap), but the defined/undefined information must /// be kept separate for each individual range. /// By convention, EntryInfoMap[&LR] = { Defined, Undefined }. - std::map<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap; + typedef DenseMap<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap; + EntryInfoMap EntryInfos; /// Map each basic block where a live range is live out to the live-out value /// and its defining block. diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp index 7f1c69c..92cca1a 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } LiveInterval &LI = LIS.createEmptyInterval(VReg); + if (Parent && !Parent->isSpillable()) + LI.markNotSpillable(); // Create empty subranges if the OldReg's interval has them. Do not create // the main range here---it will be constructed later after the subranges // have been finalized. @@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) { if (VRM) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } + // FIXME: Getting the interval here actually computes it. + // In theory, this may not be what we want, but in practice + // the createEmptyIntervalFrom API is used when this is not + // the case. Generally speaking we just want to annotate the + // LiveInterval when it gets created but we cannot do that at + // the moment. 
+ if (Parent && !Parent->isSpillable()) + LIS.getInterval(VReg).markNotSpillable(); return VReg; } @@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg) if (VRM) VRM->grow(); - if (Parent && !Parent->isSpillable()) - LIS.getInterval(VReg).markNotSpillable(); - NewRegs.push_back(VReg); } diff --git a/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp b/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp new file mode 100644 index 0000000..552f4b5 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp @@ -0,0 +1,231 @@ +//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +///===---------------------------------------------------------------------===// +/// +/// \file +/// This pass moves instructions close to the definitions of their operands to +/// shrink the live range of the def instruction. The code motion is limited within +/// the basic block. The moved instruction should have one def, and more than one +/// use, each of which is the only use of its def. +/// +///===---------------------------------------------------------------------===// +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lrshrink" + +STATISTIC(NumInstrsHoistedToShrinkLiveRange, + "Number of instructions hoisted to shrink live range."); + +using namespace llvm; + +namespace { +class LiveRangeShrink : public MachineFunctionPass { +public: + static char ID; + + LiveRangeShrink() : MachineFunctionPass(ID) { + initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Live Range Shrink"; } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char LiveRangeShrink::ID = 0; +char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID; + +INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false, + false) +namespace { +typedef DenseMap<MachineInstr *, unsigned> InstOrderMap; + +/// Returns \p New if it is dominated by \p Old, otherwise returns \p Old. +/// \p M maintains a map from instruction to its dominating order such that +/// M[A] > M[B] guarantees that A is dominated by B. +/// If \p New is not in \p M, return \p Old. Otherwise, if \p Old is null, return +/// \p New. +MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old, + const InstOrderMap &M) { + auto NewIter = M.find(&New); + if (NewIter == M.end()) + return Old; + if (Old == nullptr) + return &New; + unsigned OrderOld = M.find(Old)->second; + unsigned OrderNew = NewIter->second; + if (OrderOld != OrderNew) + return OrderOld < OrderNew ? &New : Old; + // OrderOld == OrderNew, so iterate down from Old to see if it + // can reach New; if it can, New is dominated by Old. + for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew; + I = I->getNextNode()) + if (I == &New) + return &New; + return Old; +} + +/// Builds the instruction to dominating order number map \p M by traversing +/// from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) { + M.clear(); + unsigned i = 0; + for (MachineInstr &I : make_range(Start, Start->getParent()->end())) + M[&I] = i++; +} +} // end anonymous namespace + +bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); + + InstOrderMap IOM; + // Map from register to instruction order (value of IOM) where the + // register is used last. When moving an instruction up, we need to + // make sure all its defs (including dead defs) will not cross its + // last use when moving up. + DenseMap<unsigned, std::pair<unsigned, MachineInstr *>> UseMap; + + for (MachineBasicBlock &MBB : MF) { + if (MBB.empty()) + continue; + bool SawStore = false; + BuildInstOrderMap(MBB.begin(), IOM); + UseMap.clear(); + + for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) { + MachineInstr &MI = *Next; + ++Next; + if (MI.isPHI() || MI.isDebugValue()) + continue; + if (MI.mayStore()) + SawStore = true; + + unsigned CurrentOrder = IOM[&MI]; + unsigned Barrier = 0; + MachineInstr *BarrierMI = nullptr; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDebug()) + continue; + if (MO.isUse()) + UseMap[MO.getReg()] = std::make_pair(CurrentOrder, &MI); + else if (MO.isDead() && UseMap.count(MO.getReg())) + // Barrier is the last instruction where MO gets used. MI should not + // be moved above Barrier. + if (Barrier < UseMap[MO.getReg()].first) { + Barrier = UseMap[MO.getReg()].first; + BarrierMI = UseMap[MO.getReg()].second; + } + } + + if (!MI.isSafeToMove(nullptr, SawStore)) { + // If MI has side effects, it should become a barrier for code motion. + // IOM is rebuilt from the next instruction to prevent later + // instructions from being moved before this MI. + if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) { + BuildInstOrderMap(Next, IOM); + SawStore = false; + } + continue; + } + + const MachineOperand *DefMO = nullptr; + MachineInstr *Insert = nullptr; + + // Number of live-ranges that will be shortened. We do not count + // live-ranges that are defined by a COPY as the COPY could be coalesced later. + unsigned NumEligibleUse = 0; + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDead() || MO.isDebug()) + continue; + unsigned Reg = MO.getReg(); + // Do not move the instruction if it defs/uses a physical register, + // unless it is a constant physical register or a noreg. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Reg || MRI.isConstantPhysReg(Reg)) + continue; + Insert = nullptr; + break; + } + if (MO.isDef()) { + // Do not move if there is more than one def. + if (DefMO) { + Insert = nullptr; + break; + } + DefMO = &MO; + } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg) && DefMO && + MRI.getRegClass(DefMO->getReg()) == + MRI.getRegClass(MO.getReg())) { + // The heuristic does not handle different register classes yet + // (registers of different sizes, looser/tighter constraints). This + // is because it needs a more accurate model to handle register + // pressure correctly.
+ MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); + if (!DefInstr.isCopy()) + NumEligibleUse++; + Insert = FindDominatedInstruction(DefInstr, Insert, IOM); + } else { + Insert = nullptr; + break; + } + } + + // If Barrier equals IOM[I], traverse forward to find if BarrierMI is + // after Insert, if yes, then we should not hoist. + for (MachineInstr *I = Insert; I && IOM[I] == Barrier; + I = I->getNextNode()) + if (I == BarrierMI) { + Insert = nullptr; + break; + } + // Move the instruction when # of shrunk live range > 1. + if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) { + MachineBasicBlock::iterator I = std::next(Insert->getIterator()); + // Skip all the PHI and debug instructions. + while (I != MBB.end() && (I->isPHI() || I->isDebugValue())) + I = std::next(I); + if (I == MI.getIterator()) + continue; + + // Update the dominator order to be the same as the insertion point. + // We do this to maintain a non-decreasing order without need to update + // all instruction orders after the insertion point. + unsigned NewOrder = IOM[&*I]; + IOM[&MI] = NewOrder; + NumInstrsHoistedToShrinkLiveRange++; + + // Find MI's debug value following MI. + MachineBasicBlock::iterator EndIter = std::next(MI.getIterator()); + if (MI.getOperand(0).isReg()) + for (; EndIter != MBB.end() && EndIter->isDebugValue() && + EndIter->getOperand(0).isReg() && + EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg(); + ++EndIter, ++Next) + IOM[&*EndIter] = NewOrder; + MBB.splice(I, &MBB, MI.getIterator(), EndIter); + } + } + } + return false; +} diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp index 7a51386..60033db 100644 --- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -1,4 +1,4 @@ -//===-- LiveRegMatrix.cpp - Track register interference -------------------===// +//===- LiveRegMatrix.cpp - Track register interference --------------------===// // // The LLVM Compiler Infrastructure // @@ -14,12 +14,19 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> using namespace llvm; @@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix", "Live Register Matrix", false, false) -LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID), - UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {} +LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {} void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, return Result; } -LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg, +LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR, unsigned RegUnit) { LiveIntervalUnion::Query &Q = Queries[RegUnit]; - Q.init(UserTag, &VirtReg, &Matrix[RegUnit]); + Q.init(UserTag, LR, Matrix[RegUnit]); return Q; } @@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, 
unsigned PhysReg) { return IK_RegUnit; // Check the matrix for virtual register interference. - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) - if (query(VirtReg, *Units).checkInterference()) - return IK_VirtReg; + bool Interference = foreachUnit(TRI, VirtReg, PhysReg, + [&](unsigned Unit, const LiveRange &LR) { + return query(LR, Unit).checkInterference(); + }); + if (Interference) + return IK_VirtReg; return IK_Free; } diff --git a/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp new file mode 100644 index 0000000..f9ba4ff --- /dev/null +++ b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp @@ -0,0 +1,132 @@ +//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file imlements the LiveRegUnits set. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LiveRegUnits.h" + +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) { + for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) { + for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) { + if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) + Units.reset(U); + } + } +} + +void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) { + for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) { + for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) { + if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) + Units.set(U); + } + } +} + +void LiveRegUnits::stepBackward(const MachineInstr &MI) { + // Remove defined registers and regmask kills from the set. + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + if (O->isReg()) { + if (!O->isDef()) + continue; + unsigned Reg = O->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + removeReg(Reg); + } else if (O->isRegMask()) + removeRegsNotPreserved(O->getRegMask()); + } + + // Add uses to the set. + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + if (!O->isReg() || !O->readsReg()) + continue; + unsigned Reg = O->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + addReg(Reg); + } +} + +void LiveRegUnits::accumulate(const MachineInstr &MI) { + // Add defs, uses and regmask clobbers to the set. + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + if (O->isReg()) { + unsigned Reg = O->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + if (!O->isDef() && !O->readsReg()) + continue; + addReg(Reg); + } else if (O->isRegMask()) + addRegsInMask(O->getRegMask()); + } +} + +/// Add live-in registers of basic block \p MBB to \p LiveUnits. +static void addBlockLiveIns(LiveRegUnits &LiveUnits, + const MachineBasicBlock &MBB) { + for (const auto &LI : MBB.liveins()) + LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask); +} + +/// Adds all callee saved registers to \p LiveUnits. 
+static void addCalleeSavedRegs(LiveRegUnits &LiveUnits, + const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) + LiveUnits.addReg(*CSR); +} + +/// Adds pristine registers to the given \p LiveUnits. Pristine registers are +/// callee saved registers that are unused in the function. +static void addPristines(LiveRegUnits &LiveUnits, const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; + /// Add all callee saved regs, then remove the ones that are saved+restored. + addCalleeSavedRegs(LiveUnits, MF); + /// Remove the ones that are not saved/restored; they are pristine. + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + LiveUnits.removeReg(Info.getReg()); +} + +void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + if (!MBB.succ_empty()) { + addPristines(*this, MF); + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*this, *Succ); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) + addCalleeSavedRegs(*this, MF); + } +} + +void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + addPristines(*this, MF); + addBlockLiveIns(*this, MBB); +} diff --git a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp index dbf1f96..b51f8b0 100644 --- a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp +++ b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp @@ -25,10 +25,10 @@ using namespace llvm; #define DEBUG_TYPE "livestacks" char LiveStacks::ID = 0; -INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks", +INITIALIZE_PASS_BEGIN(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_END(LiveStacks, "livestacks", +INITIALIZE_PASS_END(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) char &llvm::LiveStacksID = LiveStacks::ID; diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp index 269b990a31..a9aec92 100644 --- a/contrib/llvm/lib/CodeGen/LiveVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp @@ -64,8 +64,8 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const { return nullptr; } -LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { dbgs() << " Alive in blocks: "; for (SparseBitVector<>::iterator I = AliveBlocks.begin(), E = AliveBlocks.end(); I != E; ++I) @@ -78,8 +78,8 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { dbgs() << "\n #" << i << ": " << *Kills[i]; dbgs() << "\n"; } -#endif } +#endif /// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg. 
LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) { @@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *SuccBB) { const unsigned NumNew = BB->getNumber(); - SmallSet<unsigned, 16> Defs, Kills; + DenseSet<unsigned> Defs, Kills; MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end(); for (; BBI != BBE && BBI->isPHI(); ++BBI) { diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index e189fb0..b109f19 100644 --- a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" @@ -23,6 +22,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -103,10 +103,10 @@ namespace { char LocalStackSlotPass::ID = 0; char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID; -INITIALIZE_PASS_BEGIN(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_BEGIN(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_END(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) diff --git a/contrib/llvm/lib/CodeGen/LowLevelType.cpp b/contrib/llvm/lib/CodeGen/LowLevelType.cpp index d74b730..1c682e7 100644 --- a/contrib/llvm/lib/CodeGen/LowLevelType.cpp +++ b/contrib/llvm/lib/CodeGen/LowLevelType.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===// +//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -18,54 +18,21 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -LLT::LLT(Type &Ty, const DataLayout &DL) { +LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) { if (auto VTy = dyn_cast<VectorType>(&Ty)) { - SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits(); - ElementsOrAddrSpace = VTy->getNumElements(); - Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; + auto NumElements = VTy->getNumElements(); + LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL); + if (NumElements == 1) + return ScalarTy; + return LLT::vector(NumElements, ScalarTy); } else if (auto PTy = dyn_cast<PointerType>(&Ty)) { - Kind = Pointer; - SizeInBits = DL.getTypeSizeInBits(&Ty); - ElementsOrAddrSpace = PTy->getAddressSpace(); + return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty)); } else if (Ty.isSized()) { // Aggregates are no different from real scalars as far as GlobalISel is // concerned. 
- Kind = Scalar; - SizeInBits = DL.getTypeSizeInBits(&Ty); - ElementsOrAddrSpace = 1; + auto SizeInBits = DL.getTypeSizeInBits(&Ty); assert(SizeInBits != 0 && "invalid zero-sized type"); - } else { - Kind = Invalid; - SizeInBits = ElementsOrAddrSpace = 0; + return LLT::scalar(SizeInBits); } -} - -LLT::LLT(MVT VT) { - if (VT.isVector()) { - SizeInBits = VT.getVectorElementType().getSizeInBits(); - ElementsOrAddrSpace = VT.getVectorNumElements(); - Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; - } else if (VT.isValid()) { - // Aggregates are no different from real scalars as far as GlobalISel is - // concerned. - Kind = Scalar; - SizeInBits = VT.getSizeInBits(); - ElementsOrAddrSpace = 1; - assert(SizeInBits != 0 && "invalid zero-sized type"); - } else { - Kind = Invalid; - SizeInBits = ElementsOrAddrSpace = 0; - } -} - -void LLT::print(raw_ostream &OS) const { - if (isVector()) - OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">"; - else if (isPointer()) - OS << "p" << getAddressSpace(); - else if (isValid()) { - assert(isScalar() && "unexpected type"); - OS << "s" << getScalarSizeInBits(); - } else - llvm_unreachable("trying to print an invalid type"); + return LLT(); } diff --git a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp index 6966c8c..0fc48d4 100644 --- a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp +++ b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" @@ -28,14 +29,12 @@ using namespace llvm; namespace { class LowerEmuTLS : public ModulePass { - const TargetMachine *TM; public: static char ID; // Pass identification, replacement for typeid - explicit LowerEmuTLS() : ModulePass(ID), TM(nullptr) { } - explicit LowerEmuTLS(const TargetMachine *TM) - : ModulePass(ID), TM(TM) { + LowerEmuTLS() : ModulePass(ID) { initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry()); } + bool runOnModule(Module &M) override; private: bool addEmuTlsVar(Module &M, const GlobalVariable *GV); @@ -54,19 +53,22 @@ private: char LowerEmuTLS::ID = 0; -INITIALIZE_PASS(LowerEmuTLS, "loweremutls", - "Add __emutls_[vt]. variables for emultated TLS model", - false, false) +INITIALIZE_PASS(LowerEmuTLS, DEBUG_TYPE, + "Add __emutls_[vt]. 
variables for emultated TLS model", false, + false) -ModulePass *llvm::createLowerEmuTLSPass(const TargetMachine *TM) { - return new LowerEmuTLS(TM); -} +ModulePass *llvm::createLowerEmuTLSPass() { return new LowerEmuTLS(); } bool LowerEmuTLS::runOnModule(Module &M) { if (skipModule(M)) return false; - if (!TM || !TM->Options.EmulatedTLS) + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + auto &TM = TPC->getTM<TargetMachine>(); + if (!TM.Options.EmulatedTLS) return false; bool Changed = false; diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 1f1ce6e..58a655a 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -365,6 +365,14 @@ static Cursor maybeLexIRValue(Cursor C, MIToken &Token, return lexName(C, Token, MIToken::NamedIRValue, Rule.size(), ErrorCallback); } +static Cursor maybeLexStringConstant(Cursor C, MIToken &Token, + ErrorCallbackType ErrorCallback) { + if (C.peek() != '"') + return None; + return lexName(C, Token, MIToken::StringConstant, /*PrefixLength=*/0, + ErrorCallback); +} + static Cursor lexVirtualRegister(Cursor C, MIToken &Token) { auto Range = C; C.advance(); // Skip '%' @@ -630,6 +638,8 @@ StringRef llvm::lexMIToken(StringRef Source, MIToken &Token, return R.remaining(); if (Cursor R = maybeLexEscapedIRValue(C, Token, ErrorCallback)) return R.remaining(); + if (Cursor R = maybeLexStringConstant(C, Token, ErrorCallback)) + return R.remaining(); Token.reset(MIToken::Error, C.remaining()); ErrorCallback(C.location(), diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h index edba749..08b82e5 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -16,8 +16,8 @@ #define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H #include "llvm/ADT/APSInt.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include <functional> namespace llvm { @@ -127,7 +127,8 @@ struct MIToken { NamedIRValue, IRValue, QuotedIRValue, // `<constant value>` - SubRegisterIndex + SubRegisterIndex, + StringConstant }; private: @@ -168,7 +169,8 @@ public: bool isMemoryOperandFlag() const { return Kind == kw_volatile || Kind == kw_non_temporal || - Kind == kw_dereferenceable || Kind == kw_invariant; + Kind == kw_dereferenceable || Kind == kw_invariant || + Kind == StringConstant; } bool is(TokenKind K) const { return Kind == K; } diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp index c8bed08..c68d87b 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -11,12 +11,22 @@ // //===----------------------------------------------------------------------===// -#include "MIParser.h" #include "MILexer.h" +#include "MIParser.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" +#include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include 
"llvm/CodeGen/MachineFunction.h" @@ -24,25 +34,57 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> #include <cctype> +#include <cstddef> +#include <cstdint> +#include <limits> +#include <string> +#include <utility> using namespace llvm; PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF, - SourceMgr &SM, const SlotMapping &IRSlots) - : MF(MF), SM(&SM), IRSlots(IRSlots) { + SourceMgr &SM, const SlotMapping &IRSlots, + const Name2RegClassMap &Names2RegClasses, + const Name2RegBankMap &Names2RegBanks) + : MF(MF), SM(&SM), IRSlots(IRSlots), Names2RegClasses(Names2RegClasses), + Names2RegBanks(Names2RegBanks) { } VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) { @@ -99,6 +141,8 @@ class MIParser { StringMap<unsigned> Names2DirectTargetFlags; /// Maps from direct target flag names to the bitmask target flag values. StringMap<unsigned> Names2BitmaskTargetFlags; + /// Maps from MMO target flag names to MMO target flag values. 
+ StringMap<MachineMemOperand::Flags> Names2MMOTargetFlags; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -131,7 +175,8 @@ public: bool parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots); - bool parseBasicBlock(MachineBasicBlock &MBB); + bool parseBasicBlock(MachineBasicBlock &MBB, + MachineBasicBlock *&AddFalthroughFrom); bool parseBasicBlockLiveins(MachineBasicBlock &MBB); bool parseBasicBlockSuccessors(MachineBasicBlock &MBB); @@ -139,6 +184,7 @@ public: bool parseVirtualRegister(VRegInfo *&Info); bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo); bool parseRegisterFlag(unsigned &Flags); + bool parseRegisterClassOrBank(VRegInfo &RegInfo); bool parseSubRegisterIndex(unsigned &SubReg); bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx); bool parseRegisterOperand(MachineOperand &Dest, @@ -172,6 +218,7 @@ public: bool parseIntrinsicOperand(MachineOperand &Dest); bool parsePredicateOperand(MachineOperand &Dest); bool parseTargetIndexOperand(MachineOperand &Dest); + bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); bool parseMachineOperand(MachineOperand &Dest, Optional<unsigned> &TiedDefIdx); @@ -184,6 +231,8 @@ public: bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags); bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV); bool parseMachinePointerInfo(MachinePointerInfo &Dest); + bool parseOptionalScope(LLVMContext &Context, SyncScope::ID &SSID); + bool parseOptionalAtomicOrdering(AtomicOrdering &Order); bool parseMachineMemoryOperand(MachineMemOperand *&Dest); private: @@ -272,6 +321,18 @@ private: /// /// Return true if the name isn't a name of a bitmask target flag. bool getBitmaskTargetFlag(StringRef Name, unsigned &Flag); + + void initNames2MMOTargetFlags(); + + /// Try to convert a name of a MachineMemOperand target flag to the + /// corresponding target flag. + /// + /// Return true if the name isn't a name of a target MMO flag. + bool getMMOTargetFlag(StringRef Name, MachineMemOperand::Flags &Flag); + + /// parseStringConstant + /// ::= StringConstant + bool parseStringConstant(std::string &Result); }; } // end anonymous namespace @@ -512,7 +573,8 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { return false; } -bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { +bool MIParser::parseBasicBlock(MachineBasicBlock &MBB, + MachineBasicBlock *&AddFalthroughFrom) { // Skip the definition. assert(Token.is(MIToken::MachineBasicBlockLabel)); lex(); @@ -532,10 +594,12 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { // // is equivalent to // liveins: %edi, %esi + bool ExplicitSuccessors = false; while (true) { if (Token.is(MIToken::kw_successors)) { if (parseBasicBlockSuccessors(MBB)) return true; + ExplicitSuccessors = true; } else if (Token.is(MIToken::kw_liveins)) { if (parseBasicBlockLiveins(MBB)) return true; @@ -551,10 +615,9 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { // Parse the instructions. 
bool IsInBundle = false; MachineInstr *PrevMI = nullptr; - while (true) { - if (Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof)) - return false; - else if (consumeIfPresent(MIToken::Newline)) + while (!Token.is(MIToken::MachineBasicBlockLabel) && + !Token.is(MIToken::Eof)) { + if (consumeIfPresent(MIToken::Newline)) continue; if (consumeIfPresent(MIToken::rbrace)) { // The first parsing pass should verify that all closing '}' have an @@ -586,6 +649,22 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { assert(Token.isNewlineOrEOF() && "MI is not fully parsed"); lex(); } + + // Construct successor list by searching for basic block machine operands. + if (!ExplicitSuccessors) { + SmallVector<MachineBasicBlock*,4> Successors; + bool IsFallthrough; + guessSuccessors(MBB, Successors, IsFallthrough); + for (MachineBasicBlock *Succ : Successors) + MBB.addSuccessor(Succ); + + if (IsFallthrough) { + AddFalthroughFrom = &MBB; + } else { + MBB.normalizeSuccProbs(); + } + } + return false; } @@ -599,11 +678,18 @@ bool MIParser::parseBasicBlocks() { // The first parsing pass should have verified that this token is a MBB label // in the 'parseBasicBlockDefinitions' method. assert(Token.is(MIToken::MachineBasicBlockLabel)); + MachineBasicBlock *AddFalthroughFrom = nullptr; do { MachineBasicBlock *MBB = nullptr; if (parseMBBReference(MBB)) return true; - if (parseBasicBlock(*MBB)) + if (AddFalthroughFrom) { + if (!AddFalthroughFrom->isSuccessor(MBB)) + AddFalthroughFrom->addSuccessor(MBB); + AddFalthroughFrom->normalizeSuccProbs(); + AddFalthroughFrom = nullptr; + } + if (parseBasicBlock(*MBB, AddFalthroughFrom)) return true; // The method 'parseBasicBlock' should parse the whole block until the next // block or the end of file. @@ -878,6 +964,66 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) { } } +bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) { + if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore)) + return error("expected '_', register class, or register bank name"); + StringRef::iterator Loc = Token.location(); + StringRef Name = Token.stringValue(); + + // Was it a register class? + auto RCNameI = PFS.Names2RegClasses.find(Name); + if (RCNameI != PFS.Names2RegClasses.end()) { + lex(); + const TargetRegisterClass &RC = *RCNameI->getValue(); + + switch (RegInfo.Kind) { + case VRegInfo::UNKNOWN: + case VRegInfo::NORMAL: + RegInfo.Kind = VRegInfo::NORMAL; + if (RegInfo.Explicit && RegInfo.D.RC != &RC) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + return error(Loc, Twine("conflicting register classes, previously: ") + + Twine(TRI.getRegClassName(RegInfo.D.RC))); + } + RegInfo.D.RC = &RC; + RegInfo.Explicit = true; + return false; + + case VRegInfo::GENERIC: + case VRegInfo::REGBANK: + return error(Loc, "register class specification on generic register"); + } + llvm_unreachable("Unexpected register kind"); + } + + // Should be a register bank or a generic register. + const RegisterBank *RegBank = nullptr; + if (Name != "_") { + auto RBNameI = PFS.Names2RegBanks.find(Name); + if (RBNameI == PFS.Names2RegBanks.end()) + return error(Loc, "expected '_', register class, or register bank name"); + RegBank = RBNameI->getValue(); + } + + lex(); + + switch (RegInfo.Kind) { + case VRegInfo::UNKNOWN: + case VRegInfo::GENERIC: + case VRegInfo::REGBANK: + RegInfo.Kind = RegBank ? 
VRegInfo::REGBANK : VRegInfo::GENERIC; + if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank) + return error(Loc, "conflicting generic register banks"); + RegInfo.D.RegBank = RegBank; + RegInfo.Explicit = true; + return false; + + case VRegInfo::NORMAL: + return error(Loc, "register bank specification on normal register"); + } + llvm_unreachable("Unexpected register kind"); +} + bool MIParser::parseRegisterFlag(unsigned &Flags) { const unsigned OldFlags = Flags; switch (Token.kind()) { @@ -1004,6 +1150,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, if (!TargetRegisterInfo::isVirtualRegister(Reg)) return error("subregister index expects a virtual register"); } + if (Token.is(MIToken::colon)) { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return error("register class specification expects a virtual register"); + lex(); + if (parseRegisterClassOrBank(*RegInfo)) + return true; + } MachineRegisterInfo &MRI = MF.getRegInfo(); if ((Flags & RegState::Define) == 0) { if (consumeIfPresent(MIToken::lparen)) { @@ -1598,6 +1751,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) { return false; } +bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) { + assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + + uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs()); + while (true) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg; + if (parseNamedRegister(Reg)) + return true; + lex(); + Mask[Reg / 32] |= 1U << (Reg % 32); + // TODO: Report an error if the same register is used more than once. + if (Token.isNot(MIToken::comma)) + break; + lex(); + } + + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateRegMask(Mask); + return false; +} + bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { assert(Token.is(MIToken::kw_liveout)); const auto *TRI = MF.getSubtarget().getRegisterInfo(); @@ -1695,8 +1877,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, Dest = MachineOperand::CreateRegMask(RegMask); lex(); break; - } - LLVM_FALLTHROUGH; + } else + return parseCustomRegisterMaskOperand(Dest); default: // FIXME: Parse the MCSymbol machine operand. return error("expected a machine operand"); @@ -1867,7 +2049,14 @@ bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { case MIToken::kw_invariant: Flags |= MachineMemOperand::MOInvariant; break; - // TODO: parse the target specific memory operand flags. + case MIToken::StringConstant: { + MachineMemOperand::Flags TF; + if (getMMOTargetFlag(Token.stringValue(), TF)) + return error("use of undefined target MMO flag '" + Token.stringValue() + + "'"); + Flags |= TF; + break; + } default: llvm_unreachable("The current token should be a memory operand flag"); } @@ -1909,7 +2098,7 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { // The token was already consumed, so use return here instead of break. 
return false; } - case MIToken::kw_call_entry: { + case MIToken::kw_call_entry: lex(); switch (Token.kind()) { case MIToken::GlobalValue: @@ -1929,7 +2118,6 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; - } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -1969,6 +2157,48 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { return false; } +bool MIParser::parseOptionalScope(LLVMContext &Context, + SyncScope::ID &SSID) { + SSID = SyncScope::System; + if (Token.is(MIToken::Identifier) && Token.stringValue() == "syncscope") { + lex(); + if (expectAndConsume(MIToken::lparen)) + return error("expected '(' in syncscope"); + + std::string SSN; + if (parseStringConstant(SSN)) + return true; + + SSID = Context.getOrInsertSyncScopeID(SSN); + if (expectAndConsume(MIToken::rparen)) + return error("expected ')' in syncscope"); + } + + return false; +} + +bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) { + Order = AtomicOrdering::NotAtomic; + if (Token.isNot(MIToken::Identifier)) + return false; + + Order = StringSwitch<AtomicOrdering>(Token.stringValue()) + .Case("unordered", AtomicOrdering::Unordered) + .Case("monotonic", AtomicOrdering::Monotonic) + .Case("acquire", AtomicOrdering::Acquire) + .Case("release", AtomicOrdering::Release) + .Case("acq_rel", AtomicOrdering::AcquireRelease) + .Case("seq_cst", AtomicOrdering::SequentiallyConsistent) + .Default(AtomicOrdering::NotAtomic); + + if (Order != AtomicOrdering::NotAtomic) { + lex(); + return false; + } + + return error("expected an atomic scope, ordering or a size integer literal"); +} + bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (expectAndConsume(MIToken::lparen)) return true; @@ -1986,6 +2216,19 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { Flags |= MachineMemOperand::MOStore; lex(); + // Optional synchronization scope. + SyncScope::ID SSID; + if (parseOptionalScope(MF.getFunction()->getContext(), SSID)) + return true; + + // Up to two atomic orderings (cmpxchg provides guarantees on failure). 
+ AtomicOrdering Order, FailureOrder; + if (parseOptionalAtomicOrdering(Order)) + return true; + + if (parseOptionalAtomicOrdering(FailureOrder)) + return true; + if (Token.isNot(MIToken::IntegerLiteral)) return error("expected the size integer literal after memory operation"); uint64_t Size; @@ -2040,8 +2283,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { } if (expectAndConsume(MIToken::rparen)) return true; - Dest = - MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range); + Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range, + SSID, Order, FailureOrder); return false; } @@ -2254,6 +2497,35 @@ bool MIParser::getBitmaskTargetFlag(StringRef Name, unsigned &Flag) { return false; } +void MIParser::initNames2MMOTargetFlags() { + if (!Names2MMOTargetFlags.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Flags = TII->getSerializableMachineMemOperandTargetFlags(); + for (const auto &I : Flags) + Names2MMOTargetFlags.insert( + std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getMMOTargetFlag(StringRef Name, + MachineMemOperand::Flags &Flag) { + initNames2MMOTargetFlags(); + auto FlagInfo = Names2MMOTargetFlags.find(Name); + if (FlagInfo == Names2MMOTargetFlags.end()) + return true; + Flag = FlagInfo->second; + return false; +} + +bool MIParser::parseStringConstant(std::string &Result) { + if (Token.isNot(MIToken::StringConstant)) + return error("expected string constant"); + Result = Token.stringValue(); + lex(); + return false; +} + bool llvm::parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS, StringRef Src, SMDiagnostic &Error) { diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h index 93a4d84..2307881 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h @@ -1,4 +1,4 @@ -//===- MIParser.h - Machine Instructions Parser ---------------------------===// +//===- MIParser.h - Machine Instructions Parser -----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,21 +15,19 @@ #define LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/Allocator.h" namespace llvm { -class StringRef; -class BasicBlock; class MachineBasicBlock; class MachineFunction; -class MachineInstr; -class MachineRegisterInfo; class MDNode; class RegisterBank; struct SlotMapping; class SMDiagnostic; class SourceMgr; +class StringRef; class TargetRegisterClass; struct VRegInfo { @@ -45,11 +43,16 @@ struct VRegInfo { unsigned PreferredReg = 0; }; +using Name2RegClassMap = StringMap<const TargetRegisterClass *>; +using Name2RegBankMap = StringMap<const RegisterBank *>; + struct PerFunctionMIParsingState { BumpPtrAllocator Allocator; MachineFunction &MF; SourceMgr *SM; const SlotMapping &IRSlots; + const Name2RegClassMap &Names2RegClasses; + const Name2RegBankMap &Names2RegBanks; DenseMap<unsigned, MachineBasicBlock *> MBBSlots; DenseMap<unsigned, VRegInfo*> VRegInfos; @@ -59,7 +62,9 @@ struct PerFunctionMIParsingState { DenseMap<unsigned, unsigned> JumpTableSlots; PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, - const SlotMapping &IRSlots); + const SlotMapping &IRSlots, + const Name2RegClassMap &Names2RegClasses, + const Name2RegBankMap &Names2RegBanks); VRegInfo &getVRegInfo(unsigned VReg); }; @@ 
-115,4 +120,4 @@ bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, } // end namespace llvm -#endif +#endif // LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 3dff114..78b57f3 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -50,18 +50,24 @@ namespace llvm { /// file. class MIRParserImpl { SourceMgr SM; + yaml::Input In; StringRef Filename; LLVMContext &Context; - StringMap<std::unique_ptr<yaml::MachineFunction>> Functions; SlotMapping IRSlots; /// Maps from register class names to register classes. - StringMap<const TargetRegisterClass *> Names2RegClasses; + Name2RegClassMap Names2RegClasses; /// Maps from register bank names to register banks. - StringMap<const RegisterBank *> Names2RegBanks; + Name2RegBankMap Names2RegBanks; + /// True when the MIR file doesn't have LLVM IR. Dummy IR functions are + /// created and inserted into the given module when this is true. + bool NoLLVMIR = false; + /// True when a well formed MIR file does not contain any MIR/machine function + /// parts. + bool NoMIRDocuments = false; public: - MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename, - LLVMContext &Context); + MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, + StringRef Filename, LLVMContext &Context); void reportDiagnostic(const SMDiagnostic &Diag); @@ -85,22 +91,22 @@ public: /// file. /// /// Return null if an error occurred. - std::unique_ptr<Module> parse(); + std::unique_ptr<Module> parseIRModule(); + + bool parseMachineFunctions(Module &M, MachineModuleInfo &MMI); /// Parse the machine function in the current YAML document. /// - /// \param NoLLVMIR - set to true when the MIR file doesn't have LLVM IR. - /// A dummy IR function is created and inserted into the given module when - /// this parameter is true. /// /// Return true if an error occurred. - bool parseMachineFunction(yaml::Input &In, Module &M, bool NoLLVMIR); + bool parseMachineFunction(Module &M, MachineModuleInfo &MMI); /// Initialize the machine function to the state that's described in the MIR /// file. /// /// Return true if error occurred. - bool initializeMachineFunction(MachineFunction &MF); + bool initializeMachineFunction(const yaml::MachineFunction &YamlMF, + MachineFunction &MF); bool parseRegisterInfo(PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -144,9 +150,6 @@ private: SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error, SMRange SourceRange); - /// Create an empty function with the given name. 
- void createDummyFunction(StringRef Name, Module &M); - void initNames2RegClasses(const MachineFunction &MF); void initNames2RegBanks(const MachineFunction &MF); @@ -166,10 +169,19 @@ private: } // end namespace llvm +static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) { + reinterpret_cast<MIRParserImpl *>(Context)->reportDiagnostic(Diag); +} + MIRParserImpl::MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename, LLVMContext &Context) - : SM(), Filename(Filename), Context(Context) { - SM.AddNewSourceBuffer(std::move(Contents), SMLoc()); + : SM(), + In(SM.getMemoryBuffer( + SM.AddNewSourceBuffer(std::move(Contents), SMLoc()))->getBuffer(), + nullptr, handleYAMLDiag, this), + Filename(Filename), + Context(Context) { + In.setContext(&In); } bool MIRParserImpl::error(const Twine &Message) { @@ -206,24 +218,16 @@ void MIRParserImpl::reportDiagnostic(const SMDiagnostic &Diag) { Context.diagnose(DiagnosticInfoMIRParser(Kind, Diag)); } -static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) { - reinterpret_cast<MIRParserImpl *>(Context)->reportDiagnostic(Diag); -} - -std::unique_ptr<Module> MIRParserImpl::parse() { - yaml::Input In(SM.getMemoryBuffer(SM.getMainFileID())->getBuffer(), - /*Ctxt=*/nullptr, handleYAMLDiag, this); - In.setContext(&In); - +std::unique_ptr<Module> MIRParserImpl::parseIRModule() { if (!In.setCurrentDocument()) { if (In.error()) return nullptr; // Create an empty module when the MIR file is empty. + NoMIRDocuments = true; return llvm::make_unique<Module>(Filename, Context); } std::unique_ptr<Module> M; - bool NoLLVMIR = false; // Parse the block scalar manually so that we can return unique pointer // without having to go trough YAML traits. if (const auto *BSN = @@ -237,49 +241,68 @@ std::unique_ptr<Module> MIRParserImpl::parse() { } In.nextDocument(); if (!In.setCurrentDocument()) - return M; + NoMIRDocuments = true; } else { // Create an new, empty module. M = llvm::make_unique<Module>(Filename, Context); NoLLVMIR = true; } + return M; +} + +bool MIRParserImpl::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) { + if (NoMIRDocuments) + return false; // Parse the machine functions. do { - if (parseMachineFunction(In, *M, NoLLVMIR)) - return nullptr; + if (parseMachineFunction(M, MMI)) + return true; In.nextDocument(); } while (In.setCurrentDocument()); - return M; -} - -bool MIRParserImpl::parseMachineFunction(yaml::Input &In, Module &M, - bool NoLLVMIR) { - auto MF = llvm::make_unique<yaml::MachineFunction>(); - yaml::EmptyContext Ctx; - yaml::yamlize(In, *MF, false, Ctx); - if (In.error()) - return true; - auto FunctionName = MF->Name; - if (Functions.find(FunctionName) != Functions.end()) - return error(Twine("redefinition of machine function '") + FunctionName + - "'"); - Functions.insert(std::make_pair(FunctionName, std::move(MF))); - if (NoLLVMIR) - createDummyFunction(FunctionName, M); - else if (!M.getFunction(FunctionName)) - return error(Twine("function '") + FunctionName + - "' isn't defined in the provided LLVM IR"); return false; } -void MIRParserImpl::createDummyFunction(StringRef Name, Module &M) { +/// Create an empty function with the given name. 
+static Function *createDummyFunction(StringRef Name, Module &M) { auto &Context = M.getContext(); Function *F = cast<Function>(M.getOrInsertFunction( Name, FunctionType::get(Type::getVoidTy(Context), false))); BasicBlock *BB = BasicBlock::Create(Context, "entry", F); new UnreachableInst(Context, BB); + return F; +} + +bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) { + // Parse the yaml. + yaml::MachineFunction YamlMF; + yaml::EmptyContext Ctx; + yaml::yamlize(In, YamlMF, false, Ctx); + if (In.error()) + return true; + + // Search for the corresponding IR function. + StringRef FunctionName = YamlMF.Name; + Function *F = M.getFunction(FunctionName); + if (!F) { + if (NoLLVMIR) { + F = createDummyFunction(FunctionName, M); + } else { + return error(Twine("function '") + FunctionName + + "' isn't defined in the provided LLVM IR"); + } + } + if (MMI.getMachineFunction(*F) != nullptr) + return error(Twine("redefinition of machine function '") + FunctionName + + "'"); + + // Create the MachineFunction. + MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); + if (initializeMachineFunction(YamlMF, MF)) + return true; + + return false; } static bool isSSA(const MachineFunction &MF) { @@ -319,13 +342,12 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { Properties.set(MachineFunctionProperties::Property::NoVRegs); } -bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { - auto It = Functions.find(MF.getName()); - if (It == Functions.end()) - return error(Twine("no machine function information for function '") + - MF.getName() + "' in the MIR file"); +bool +MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, + MachineFunction &MF) { // TODO: Recreate the machine function. - const yaml::MachineFunction &YamlMF = *It->getValue(); + initNames2RegClasses(MF); + initNames2RegBanks(MF); if (YamlMF.Alignment) MF.setAlignment(YamlMF.Alignment); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); @@ -338,7 +360,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { if (YamlMF.Selected) MF.getProperties().set(MachineFunctionProperties::Property::Selected); - PerFunctionMIParsingState PFS(MF, SM, IRSlots); + PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses, + Names2RegBanks); if (parseRegisterInfo(PFS, YamlMF)) return true; if (!YamlMF.Constants.empty()) { @@ -362,9 +385,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { } PFS.SM = &SM; - if (MF.empty()) - return error(Twine("machine function '") + Twine(MF.getName()) + - "' requires at least one machine basic block in its body"); // Initialize the frame information after creating all the MBBs so that the // MBB references in the frame information can be resolved. if (initializeFrameInfo(PFS, YamlMF)) @@ -462,17 +482,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, RegInfo.addLiveIn(Reg, VReg); } - // Parse the callee saved register mask. - BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size()); - if (!YamlMF.CalleeSavedRegisters) - return false; - for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { - unsigned Reg = 0; - if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) - return error(Error, RegSource.SourceRange); - CalleeSavedRegisterMask[Reg] = true; + // Parse the callee saved registers (Registers that will + // be saved for the caller). 
+ if (YamlMF.CalleeSavedRegisters) { + SmallVector<MCPhysReg, 16> CalleeSavedRegisters; + for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { + unsigned Reg = 0; + if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) + return error(Error, RegSource.SourceRange); + CalleeSavedRegisters.push_back(Reg); + } + RegInfo.setCalleeSavedRegs(CalleeSavedRegisters); } - RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip()); + return false; } @@ -505,14 +527,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS, } // Compute MachineRegisterInfo::UsedPhysRegMask - if (!YamlMF.CalleeSavedRegisters) { - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); - } + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); } } } @@ -539,7 +559,8 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, MFI.ensureMaxAlignment(YamlMFI.MaxAlignment); MFI.setAdjustsStack(YamlMFI.AdjustsStack); MFI.setHasCalls(YamlMFI.HasCalls); - MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize); + if (YamlMFI.MaxCallFrameSize != ~0u) + MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize); MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment); MFI.setHasVAStart(YamlMFI.HasVAStart); MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc); @@ -818,7 +839,6 @@ void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) { const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF, StringRef Name) { - initNames2RegClasses(MF); auto RegClassInfo = Names2RegClasses.find(Name); if (RegClassInfo == Names2RegClasses.end()) return nullptr; @@ -827,7 +847,6 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF, const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF, StringRef Name) { - initNames2RegBanks(MF); auto RegBankInfo = Names2RegBanks.find(Name); if (RegBankInfo == Names2RegBanks.end()) return nullptr; @@ -839,16 +858,18 @@ MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl) MIRParser::~MIRParser() {} -std::unique_ptr<Module> MIRParser::parseLLVMModule() { return Impl->parse(); } +std::unique_ptr<Module> MIRParser::parseIRModule() { + return Impl->parseIRModule(); +} -bool MIRParser::initializeMachineFunction(MachineFunction &MF) { - return Impl->initializeMachineFunction(MF); +bool MIRParser::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) { + return Impl->parseMachineFunctions(M, MMI); } std::unique_ptr<MIRParser> llvm::createMIRParserFromFile(StringRef Filename, SMDiagnostic &Error, LLVMContext &Context) { - auto FileOrErr = MemoryBuffer::getFile(Filename); + auto FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename); if (std::error_code EC = FileOrErr.getError()) { Error = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + EC.message()); diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp index db87092..ddeacf1 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp @@ -12,36 +12,72 @@ // //===----------------------------------------------------------------------===// -#include "MIRPrinter.h" -#include "llvm/ADT/STLExtras.h" 
+#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" -#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/YAMLTraits.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cinttypes> +#include <cstdint> +#include <iterator> +#include <string> +#include <utility> +#include <vector> using namespace llvm; +static cl::opt<bool> SimplifyMIR("simplify-mir", + cl::desc("Leave out unnecessary information when printing MIR")); + namespace { /// This structure describes how to print out stack object references. @@ -104,6 +140,11 @@ class MIPrinter { ModuleSlotTracker &MST; const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds; const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping; + /// Synchronization scope names registered with LLVMContext. 
+ SmallVector<StringRef, 8> SSNs; + + bool canPredictBranchProbabilities(const MachineBasicBlock &MBB) const; + bool canPredictSuccessors(const MachineBasicBlock &MBB) const; public: MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST, @@ -124,7 +165,9 @@ public: void print(const MachineOperand &Op, const TargetRegisterInfo *TRI, unsigned I, bool ShouldPrintRegisterTies, LLT TypeToPrint, bool IsDef = false); - void print(const MachineMemOperand &Op); + void print(const LLVMContext &Context, const TargetInstrInfo &TII, + const MachineMemOperand &Op); + void printSyncScope(const LLVMContext &Context, SyncScope::ID SSID); void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI); }; @@ -139,6 +182,7 @@ template <> struct BlockScalarTraits<Module> { static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) { Mod.print(OS, nullptr); } + static StringRef input(StringRef Str, void *Ctxt, Module &Mod) { llvm_unreachable("LLVM Module is supposed to be parsed separately"); return ""; @@ -202,9 +246,30 @@ void MIRPrinter::print(const MachineFunction &MF) { } StrOS.flush(); yaml::Output Out(OS); + if (!SimplifyMIR) + Out.setWriteDefaultValues(true); Out << YamlMF; } +static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS, + const TargetRegisterInfo *TRI) { + assert(RegMask && "Can't print an empty register mask"); + OS << StringRef("CustomRegMask("); + + bool IsRegInRegMaskFound = false; + for (int I = 0, E = TRI->getNumRegs(); I < E; I++) { + // Check whether the register is asserted in regmask. + if (RegMask[I / 32] & (1u << (I % 32))) { + if (IsRegInRegMaskFound) + OS << ','; + printReg(I, OS, TRI); + IsRegInRegMaskFound = true; + } + } + + OS << ')'; +} + void MIRPrinter::convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo, const TargetRegisterInfo *TRI) { @@ -239,20 +304,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, printReg(I->second, LiveIn.VirtualRegister, TRI); MF.LiveIns.push_back(LiveIn); } - // The used physical register mask is printed as an inverted callee saved - // register mask. - const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask(); - if (UsedPhysRegMask.none()) - return; - std::vector<yaml::FlowStringValue> CalleeSavedRegisters; - for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) { - if (!UsedPhysRegMask[I]) { + + // Prints the callee saved registers. + if (RegInfo.isUpdatedCSRsInitialized()) { + const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs(); + std::vector<yaml::FlowStringValue> CalleeSavedRegisters; + for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) { yaml::FlowStringValue Reg; - printReg(I, Reg, TRI); + printReg(*I, Reg, TRI); CalleeSavedRegisters.push_back(Reg); } + MF.CalleeSavedRegisters = CalleeSavedRegisters; } - MF.CalleeSavedRegisters = CalleeSavedRegisters; } void MIRPrinter::convert(ModuleSlotTracker &MST, @@ -267,7 +330,8 @@ void MIRPrinter::convert(ModuleSlotTracker &MST, YamlMFI.MaxAlignment = MFI.getMaxAlignment(); YamlMFI.AdjustsStack = MFI.adjustsStack(); YamlMFI.HasCalls = MFI.hasCalls(); - YamlMFI.MaxCallFrameSize = MFI.getMaxCallFrameSize(); + YamlMFI.MaxCallFrameSize = MFI.isMaxCallFrameSizeComputed() + ? 
MFI.getMaxCallFrameSize() : ~0u; YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment(); YamlMFI.HasVAStart = MFI.hasVAStart(); YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc(); @@ -434,6 +498,62 @@ void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) { RegisterMaskIds.insert(std::make_pair(Mask, I++)); } +void llvm::guessSuccessors(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineBasicBlock*> &Result, + bool &IsFallthrough) { + SmallPtrSet<MachineBasicBlock*,8> Seen; + + for (const MachineInstr &MI : MBB) { + if (MI.isPHI()) + continue; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isMBB()) + continue; + MachineBasicBlock *Succ = MO.getMBB(); + auto RP = Seen.insert(Succ); + if (RP.second) + Result.push_back(Succ); + } + } + MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr(); + IsFallthrough = I == MBB.end() || !I->isBarrier(); +} + +bool +MIPrinter::canPredictBranchProbabilities(const MachineBasicBlock &MBB) const { + if (MBB.succ_size() <= 1) + return true; + if (!MBB.hasSuccessorProbabilities()) + return true; + + SmallVector<BranchProbability,8> Normalized(MBB.Probs.begin(), + MBB.Probs.end()); + BranchProbability::normalizeProbabilities(Normalized.begin(), + Normalized.end()); + SmallVector<BranchProbability,8> Equal(Normalized.size()); + BranchProbability::normalizeProbabilities(Equal.begin(), Equal.end()); + + return std::equal(Normalized.begin(), Normalized.end(), Equal.begin()); +} + +bool MIPrinter::canPredictSuccessors(const MachineBasicBlock &MBB) const { + SmallVector<MachineBasicBlock*,8> GuessedSuccs; + bool GuessedFallthrough; + guessSuccessors(MBB, GuessedSuccs, GuessedFallthrough); + if (GuessedFallthrough) { + const MachineFunction &MF = *MBB.getParent(); + MachineFunction::const_iterator NextI = std::next(MBB.getIterator()); + if (NextI != MF.end()) { + MachineBasicBlock *Next = const_cast<MachineBasicBlock*>(&*NextI); + if (!is_contained(GuessedSuccs, Next)) + GuessedSuccs.push_back(Next); + } + } + if (GuessedSuccs.size() != MBB.succ_size()) + return false; + return std::equal(MBB.succ_begin(), MBB.succ_end(), GuessedSuccs.begin()); +} + void MIPrinter::print(const MachineBasicBlock &MBB) { assert(MBB.getNumber() >= 0 && "Invalid MBB number"); OS << "bb." << MBB.getNumber(); @@ -472,13 +592,15 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { bool HasLineAttributes = false; // Print the successors - if (!MBB.succ_empty()) { + bool canPredictProbs = canPredictBranchProbabilities(MBB); + if (!MBB.succ_empty() && (!SimplifyMIR || !canPredictProbs || + !canPredictSuccessors(MBB))) { OS.indent(2) << "successors: "; for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorProbabilities()) + if (!SimplifyMIR || !canPredictProbs) OS << '(' << format("0x%08" PRIx32, MBB.getSuccProbability(I).getNumerator()) << ')'; @@ -614,11 +736,12 @@ void MIPrinter::print(const MachineInstr &MI) { if (!MI.memoperands_empty()) { OS << " :: "; + const LLVMContext &Context = MF->getFunction()->getContext(); bool NeedComma = false; for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - print(*Op); + print(Context, *TII, *Op); NeedComma = true; } } @@ -823,7 +946,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, OS << "%const." 
<< Op.getIndex(); printOffset(Op.getOffset()); break; - case MachineOperand::MO_TargetIndex: { + case MachineOperand::MO_TargetIndex: OS << "target-index("; if (const auto *Name = getTargetIndexName( *Op.getParent()->getParent()->getParent(), Op.getIndex())) @@ -833,15 +956,20 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, OS << ')'; printOffset(Op.getOffset()); break; - } case MachineOperand::MO_JumpTableIndex: OS << "%jump-table." << Op.getIndex(); break; - case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_ExternalSymbol: { + StringRef Name = Op.getSymbolName(); OS << '$'; - printLLVMNameWithoutPrefix(OS, Op.getSymbolName()); + if (Name.empty()) { + OS << "\"\""; + } else { + printLLVMNameWithoutPrefix(OS, Name); + } printOffset(Op.getOffset()); break; + } case MachineOperand::MO_GlobalAddress: Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST); printOffset(Op.getOffset()); @@ -860,7 +988,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, if (RegMaskInfo != RegisterMaskIds.end()) OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower(); else - llvm_unreachable("Can't print this machine register mask yet."); + printCustomRegMask(Op.getRegMask(), OS, TRI); break; } case MachineOperand::MO_RegisterLiveOut: { @@ -909,9 +1037,20 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, } } -void MIPrinter::print(const MachineMemOperand &Op) { +static const char *getTargetMMOFlagName(const TargetInstrInfo &TII, + unsigned TMMOFlag) { + auto Flags = TII.getSerializableMachineMemOperandTargetFlags(); + for (const auto &I : Flags) { + if (I.first == TMMOFlag) { + return I.second; + } + } + return nullptr; +} + +void MIPrinter::print(const LLVMContext &Context, const TargetInstrInfo &TII, + const MachineMemOperand &Op) { OS << '('; - // TODO: Print operand's target specific flags. if (Op.isVolatile()) OS << "volatile "; if (Op.isNonTemporal()) @@ -920,12 +1059,29 @@ void MIPrinter::print(const MachineMemOperand &Op) { OS << "dereferenceable "; if (Op.isInvariant()) OS << "invariant "; + if (Op.getFlags() & MachineMemOperand::MOTargetFlag1) + OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag1) + << "\" "; + if (Op.getFlags() & MachineMemOperand::MOTargetFlag2) + OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag2) + << "\" "; + if (Op.getFlags() & MachineMemOperand::MOTargetFlag3) + OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag3) + << "\" "; if (Op.isLoad()) OS << "load "; else { assert(Op.isStore() && "Non load machine operand must be a store"); OS << "store "; } + + printSyncScope(Context, Op.getSyncScopeID()); + + if (Op.getOrdering() != AtomicOrdering::NotAtomic) + OS << toIRString(Op.getOrdering()) << ' '; + if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic) + OS << toIRString(Op.getFailureOrdering()) << ' '; + OS << Op.getSize(); if (const Value *Val = Op.getValue()) { OS << (Op.isLoad() ? 
" from " : " into "); @@ -988,6 +1144,23 @@ void MIPrinter::print(const MachineMemOperand &Op) { OS << ')'; } +void MIPrinter::printSyncScope(const LLVMContext &Context, SyncScope::ID SSID) { + switch (SSID) { + case SyncScope::System: { + break; + } + default: { + if (SSNs.empty()) + Context.getSyncScopeNames(SSNs); + + OS << "syncscope(\""; + PrintEscapedString(SSNs[SSID], OS); + OS << "\") "; + break; + } + } +} + static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, const TargetRegisterInfo *TRI) { int Reg = TRI->getLLVMRegNum(DwarfReg, true); diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.h b/contrib/llvm/lib/CodeGen/MIRPrinter.h deleted file mode 100644 index 16aa903..0000000 --- a/contrib/llvm/lib/CodeGen/MIRPrinter.h +++ /dev/null @@ -1,33 +0,0 @@ -//===- MIRPrinter.h - MIR serialization format printer --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the functions that print out the LLVM IR and the machine -// functions using the MIR serialization format. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_CODEGEN_MIRPRINTER_H -#define LLVM_LIB_CODEGEN_MIRPRINTER_H - -namespace llvm { - -class MachineFunction; -class Module; -class raw_ostream; - -/// Print LLVM IR using the MIR serialization format to the given output stream. -void printMIR(raw_ostream &OS, const Module &M); - -/// Print a machine function using the MIR serialization format to the given -/// output stream. -void printMIR(raw_ostream &OS, const MachineFunction &MF); - -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp index c690bcf..09354cf 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp @@ -12,10 +12,11 @@ // //===----------------------------------------------------------------------===// -#include "MIRPrinter.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MIRPrinter.h" + #include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp index 3869f97..81597af 100644 --- a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -148,8 +149,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); + iterator E = end(); - while (I != E && (I->isPHI() || I->isPosition())) + while (I != E && (I->isPHI() || I->isPosition() || + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels // inside the bundle. 
@@ -160,8 +164,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) { + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); + iterator E = end(); - while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue())) + while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() || + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels / dbg_values // inside the bundle. @@ -221,11 +228,17 @@ LLVM_DUMP_METHOD void MachineBasicBlock::dump() const { } #endif +bool MachineBasicBlock::isLegalToHoistInto() const { + if (isReturnBlock() || hasEHPadSuccessor()) + return false; + return true; +} + StringRef MachineBasicBlock::getName() const { if (const BasicBlock *LBB = getBasicBlock()) return LBB->getName(); else - return "(null)"; + return StringRef("", 0); } /// Return a hopefully unique identifier for this block. @@ -343,6 +356,13 @@ void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { LiveIns.erase(I); } +MachineBasicBlock::livein_iterator +MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) { + // Get non-const version of iterator. + LiveInVector::iterator LI = LiveIns.begin() + (I - LiveIns.begin()); + return LiveIns.erase(LI); +} + bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const { livein_iterator I = find_if( LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); @@ -417,7 +437,7 @@ void MachineBasicBlock::updateTerminator() { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - DebugLoc DL; // FIXME: this is nowhere + DebugLoc DL = findBranchDebugLoc(); bool B = TII->analyzeBranch(*this, TBB, FBB, Cond); (void) B; assert(!B && "UpdateTerminators requires analyzable predecessors!"); @@ -485,7 +505,7 @@ void MachineBasicBlock::updateTerminator() { // FIXME: This does not seem like a reasonable pattern to support, but it // has been seen in the wild coming out of degenerate ARM test cases. TII->removeBranch(*this); - + // Finally update the unconditional successor to be reached via a branch if // it would not be reached by fallthrough. if (!isLayoutSuccessor(TBB)) @@ -681,16 +701,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { return std::next(I) == MachineFunction::const_iterator(MBB); } -bool MachineBasicBlock::canFallThrough() { +MachineBasicBlock *MachineBasicBlock::getFallThrough() { MachineFunction::iterator Fallthrough = getIterator(); ++Fallthrough; // If FallthroughBlock is off the end of the function, it can't fall through. if (Fallthrough == getParent()->end()) - return false; + return nullptr; // If FallthroughBlock isn't a successor, no fallthrough is possible. if (!isSuccessor(&*Fallthrough)) - return false; + return nullptr; // Analyze the branches, if any, at the end of the block. MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -702,25 +722,31 @@ bool MachineBasicBlock::canFallThrough() { // is possible. The isPredicated check is needed because this code can be // called during IfConversion, where an instruction which is normally a // Barrier is predicated and thus no longer an actual control barrier. - return empty() || !back().isBarrier() || TII->isPredicated(back()); + return (empty() || !back().isBarrier() || TII->isPredicated(back())) + ? 
&*Fallthrough + : nullptr; } // If there is no branch, control always falls through. - if (!TBB) return true; + if (!TBB) return &*Fallthrough; // If there is some explicit branch to the fallthrough block, it can obviously // reach, even though the branch should get folded to fall through implicitly. if (MachineFunction::iterator(TBB) == Fallthrough || MachineFunction::iterator(FBB) == Fallthrough) - return true; + return &*Fallthrough; // If it's an unconditional branch to some block not the fall through, it // doesn't fall through. - if (Cond.empty()) return false; + if (Cond.empty()) return nullptr; // Otherwise, if it is conditional and has no explicit false block, it falls // through. - return FBB == nullptr; + return (FBB == nullptr) ? &*Fallthrough : nullptr; +} + +bool MachineBasicBlock::canFallThrough() { + return getFallThrough() != nullptr; } MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, @@ -1144,6 +1170,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return {}; } +/// Find and return the merged DebugLoc of the branch instructions of the block. +/// Return UnknownLoc if there is none. +DebugLoc +MachineBasicBlock::findBranchDebugLoc() { + DebugLoc DL; + auto TI = getFirstTerminator(); + while (TI != end() && !TI->isBranch()) + ++TI; + + if (TI != end()) { + DL = TI->getDebugLoc(); + for (++TI ; TI != end() ; ++TI) + if (TI->isBranch()) + DL = DILocation::getMergedLocation(DL, TI->getDebugLoc()); + } + return DL; +} + /// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 7d5124d..4d1ec11 100644 --- a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -26,9 +26,8 @@ using namespace llvm; -#define DEBUG_TYPE "block-freq" +#define DEBUG_TYPE "machine-block-freq" -#ifndef NDEBUG static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG( "view-machine-block-freq-propagation-dags", cl::Hidden, @@ -43,10 +42,37 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG( "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available."))); +// Similar option above, but used to control BFI display only after MBP pass +cl::opt<GVDAGType> ViewBlockLayoutWithBFI( + "view-block-layout-with-bfi", cl::Hidden, + cl::desc( + "Pop up a window to show a dag displaying MBP layout and associated " + "block frequencies of the CFG."), + cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), + clEnumValN(GVDT_Fraction, "fraction", + "display a graph using the " + "fractional block frequency representation."), + clEnumValN(GVDT_Integer, "integer", + "display a graph using the raw " + "integer fractional block frequency representation."), + clEnumValN(GVDT_Count, "count", + "display a graph using the real " + "profile count if available."))); +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; +// Command line option to specify hot frequency threshold. 
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc= extern cl::opt<unsigned> ViewHotFreqPercent; +static GVDAGType getGVDT() { + if (ViewBlockLayoutWithBFI != GVDT_None) + return ViewBlockLayoutWithBFI; + + return ViewMachineBlockFreqPropagationDAG; +} + namespace llvm { template <> struct GraphTraits<MachineBlockFrequencyInfo *> { @@ -80,12 +106,32 @@ template <> struct DOTGraphTraits<MachineBlockFrequencyInfo *> : public MBFIDOTGraphTraitsBase { explicit DOTGraphTraits(bool isSimple = false) - : MBFIDOTGraphTraitsBase(isSimple) {} + : MBFIDOTGraphTraitsBase(isSimple), CurFunc(nullptr), LayoutOrderMap() {} + + const MachineFunction *CurFunc; + DenseMap<const MachineBasicBlock *, int> LayoutOrderMap; std::string getNodeLabel(const MachineBasicBlock *Node, const MachineBlockFrequencyInfo *Graph) { - return MBFIDOTGraphTraitsBase::getNodeLabel( - Node, Graph, ViewMachineBlockFreqPropagationDAG); + + int layout_order = -1; + // Attach additional ordering information if 'isSimple' is false. + if (!isSimple()) { + const MachineFunction *F = Node->getParent(); + if (!CurFunc || F != CurFunc) { + if (CurFunc) + LayoutOrderMap.clear(); + + CurFunc = F; + int O = 0; + for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) { + LayoutOrderMap[&*MBI] = O; + } + } + layout_order = LayoutOrderMap[Node]; + } + return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(), + layout_order); } std::string getNodeAttributes(const MachineBasicBlock *Node, @@ -102,13 +148,12 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *> }; } // end namespace llvm -#endif -INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_END(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) char MachineBlockFrequencyInfo::ID = 0; @@ -127,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { - MachineBranchProbabilityInfo &MBPI = - getAnalysis<MachineBranchProbabilityInfo>(); - MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); +void MachineBlockFrequencyInfo::calculate( + const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, + const MachineLoopInfo &MLI) { if (!MBFI) MBFI.reset(new ImplType); MBFI->calculate(F, MBPI, MLI); -#ifndef NDEBUG if (ViewMachineBlockFreqPropagationDAG != GVDT_None && (ViewBlockFreqFuncName.empty() || F.getName().equals(ViewBlockFreqFuncName))) { - view(); + view("MachineBlockFrequencyDAGS." + F.getName()); } -#endif +} + +bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { + MachineBranchProbabilityInfo &MBPI = + getAnalysis<MachineBranchProbabilityInfo>(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + calculate(F, MBPI, MLI); return false; } @@ -148,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); } /// Pop up a ghostview window with the current block frequency propagation /// rendered using dot. -void MachineBlockFrequencyInfo::view() const { -// This code is only for debugging. 
-#ifndef NDEBUG - ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), - "MachineBlockFrequencyDAGs"); -#else - errs() << "MachineBlockFrequencyInfo::view is only available in debug builds " - "on systems with Graphviz or gv!\n"; -#endif // NDEBUG +void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const { + // This code is only for debugging. + ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple); } BlockFrequency diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 40e3840..447ad62 100644 --- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -25,22 +25,23 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "BranchFolding.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TailDuplicator.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -49,6 +50,8 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> +#include <functional> +#include <utility> using namespace llvm; #define DEBUG_TYPE "block-placement" @@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias( // Definition: // - Outlining: placement of a basic block outside the chain or hot path. -static cl::opt<bool> OutlineOptionalBranches( - "outline-optional-branches", - cl::desc("Outlining optional branches will place blocks that are optional " - "branches, i.e. branches with a common post dominator, outside " - "the hot path or chain"), - cl::init(false), cl::Hidden); - -static cl::opt<unsigned> OutlineOptionalThreshold( - "outline-optional-threshold", - cl::desc("Don't outline optional branches that are a single block with an " - "instruction count below this threshold"), - cl::init(4), cl::Hidden); - static cl::opt<unsigned> LoopToColdBlockRatio( "loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " @@ -136,20 +126,55 @@ BranchFoldPlacement("branch-fold-placement", cl::init(true), cl::Hidden); // Heuristic for tail duplication. -static cl::opt<unsigned> TailDuplicatePlacementThreshold( +static cl::opt<unsigned> TailDupPlacementThreshold( "tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden); +// Heuristic for aggressive tail duplication. +static cl::opt<unsigned> TailDupPlacementAggressiveThreshold( + "tail-dup-placement-aggressive-threshold", + cl::desc("Instruction cutoff for aggressive tail duplication during " + "layout. Used at -O3. 
Tail merging during layout is forced to " + "have a threshold that won't conflict."), cl::init(3), + cl::Hidden); + +// Heuristic for tail duplication. +static cl::opt<unsigned> TailDupPlacementPenalty( + "tail-dup-placement-penalty", + cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " + "Copying can increase fallthrough, but it also increases icache " + "pressure. This parameter controls the penalty to account for that. " + "Percent as integer."), + cl::init(2), + cl::Hidden); + +// Heuristic for triangle chains. +static cl::opt<unsigned> TriangleChainCount( + "triangle-chain-count", + cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " + "triangle tail duplication heuristic to kick in. 0 to disable."), + cl::init(2), + cl::Hidden); + extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; +// Internal option used to control BFI display only after MBP pass. +// Defined in CodeGen/MachineBlockFrequencyInfo.cpp: +// -view-block-layout-with-bfi= +extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI; + +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= +extern cl::opt<std::string> ViewBlockFreqFuncName; + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. -typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType; +typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType; } namespace { @@ -193,12 +218,15 @@ public: /// \brief Iterator over blocks within the chain. typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator; + typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator; /// \brief Beginning of blocks within the chain. iterator begin() { return Blocks.begin(); } + const_iterator begin() const { return Blocks.begin(); } /// \brief End of blocks within the chain. iterator end() { return Blocks.end(); } + const_iterator end() const { return Blocks.end(); } bool remove(MachineBasicBlock* BB) { for(iterator i = begin(); i != end(); ++i) { @@ -217,25 +245,26 @@ public: /// updating the block -> chain mapping. It does not free or tear down the /// old chain, but the old chain's block list is no longer valid. void merge(MachineBasicBlock *BB, BlockChain *Chain) { - assert(BB); - assert(!Blocks.empty()); + assert(BB && "Can't merge a null block."); + assert(!Blocks.empty() && "Can't merge into an empty chain."); // Fast path in case we don't have a chain already. if (!Chain) { - assert(!BlockToChain[BB]); + assert(!BlockToChain[BB] && + "Passed chain is null, but BB has entry in BlockToChain."); Blocks.push_back(BB); BlockToChain[BB] = this; return; } - assert(BB == *Chain->begin()); + assert(BB == *Chain->begin() && "Passed BB is not head of Chain."); assert(Chain->begin() != Chain->end()); // Update the incoming blocks to point to this chain, and add them to the // chain structure. for (MachineBasicBlock *ChainBB : *Chain) { Blocks.push_back(ChainBB); - assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain"); + assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain."); BlockToChain[ChainBB] = this; } } @@ -264,12 +293,28 @@ public: namespace { class MachineBlockPlacement : public MachineFunctionPass { /// \brief A typedef for a block filter set. 
- typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet; + typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet; + + /// Pair struct containing basic block and taildup profitiability + struct BlockAndTailDupResult { + MachineBasicBlock *BB; + bool ShouldTailDup; + }; + + /// Triple struct containing edge weight and the edge. + struct WeightedEdge { + BlockFrequency Weight; + MachineBasicBlock *Src; + MachineBasicBlock *Dest; + }; /// \brief work lists of blocks that are ready to be laid out SmallVector<MachineBasicBlock *, 16> BlockWorkList; SmallVector<MachineBasicBlock *, 16> EHPadWorkList; + /// Edges that have already been computed as optimal. + DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges; + /// \brief Machine Function MachineFunction *F; @@ -294,7 +339,7 @@ class MachineBlockPlacement : public MachineFunctionPass { const TargetLoweringBase *TLI; /// \brief A handle to the post dominator tree. - MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; /// \brief Duplicator used to duplicate tails during placement. /// @@ -303,10 +348,6 @@ class MachineBlockPlacement : public MachineFunctionPass { /// must be done inline. TailDuplicator TailDup; - /// \brief A set of blocks that are unavoidably execute, i.e. they dominate - /// all terminators of the MachineFunction. - SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks; - /// \brief Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -322,7 +363,7 @@ class MachineBlockPlacement : public MachineFunctionPass { /// BlockChain it participates in, if any. We use it to, among other things, /// allow implicitly defining edges between chains as the existing edges /// between basic blocks. - DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; + DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain; #ifndef NDEBUG /// The set of basic blocks that have terminators that cannot be fully @@ -334,75 +375,107 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. - void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markChainSuccessors( + const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); /// Decrease the UnscheduledPredecessors count for a single block, and /// if the count goes to 0, add them to the appropriate work list. 
void markBlockSuccessors( - BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB, + const BlockChain &Chain, const MachineBasicBlock *BB, + const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter = nullptr); - BranchProbability - collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter, - SmallVector<MachineBasicBlock *, 4> &Successors); - bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ, - BlockChain &Chain, - const BlockFilterSet *BlockFilter, - BranchProbability SuccProb, - BranchProbability HotProb); + collectViableSuccessors( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, + SmallVector<MachineBasicBlock *, 4> &Successors); + bool shouldPredBlockBeOutlined( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter, + BranchProbability SuccProb, BranchProbability HotProb); bool repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, - MachineBasicBlock *LoopHeaderBB, + const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt); - bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, - const BlockChain &Chain, - BlockFilterSet *BlockFilter, - MachineFunction::iterator &PrevUnplacedBlockIt, - bool &DuplicatedToPred); - bool - hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, - BlockChain &SuccChain, BranchProbability SuccProb, - BranchProbability RealSuccProb, BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock * - selectBestCandidateBlock(BlockChain &Chain, - SmallVectorImpl<MachineBasicBlock *> &WorkList); - MachineBasicBlock * - getFirstUnplacedBlock(const BlockChain &PlacedChain, - MachineFunction::iterator &PrevUnplacedBlockIt, - const BlockFilterSet *BlockFilter); + bool maybeTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *LPred, + BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &DuplicatedToPred); + bool hasBetterLayoutPredecessor( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + BlockAndTailDupResult selectBestSuccessor( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + MachineBasicBlock *selectBestCandidateBlock( + const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList); + MachineBasicBlock *getFirstUnplacedBlock( + const BlockChain &PlacedChain, + MachineFunction::iterator &PrevUnplacedBlockIt, + const BlockFilterSet *BlockFilter); /// \brief Add a basic block to the work list if it is appropriate. /// /// If the optional parameter BlockFilter is provided, only MBB /// present in the set will be added to the worklist. If nullptr /// is provided, no filtering occurs. 
- void fillWorkLists(MachineBasicBlock *MBB, + void fillWorkLists(const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter); - void buildChain(MachineBasicBlock *BB, BlockChain &Chain, + void buildChain(const MachineBasicBlock *BB, BlockChain &Chain, BlockFilterSet *BlockFilter = nullptr); - MachineBasicBlock *findBestLoopTop(MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopExit(MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - BlockFilterSet collectLoopBlockSet(MachineLoop &L); - void buildLoopChains(MachineLoop &L); - void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, - const BlockFilterSet &LoopBlockSet); - void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - void collectMustExecuteBBs(); + MachineBasicBlock *findBestLoopTop( + const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopExit( + const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + BlockFilterSet collectLoopBlockSet(const MachineLoop &L); + void buildLoopChains(const MachineLoop &L); + void rotateLoop( + BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, + const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile( + BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + /// Returns true if a block should be tail-duplicated to increase fallthrough + /// opportunities. + bool shouldTailDuplicate(MachineBasicBlock *BB); + /// Check the edge frequencies to see if tail duplication will increase + /// fallthroughs. + bool isProfitableToTailDup( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + BranchProbability AdjustedSumProb, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Check for a trellis layout. + bool isTrellis(const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Get the best successor given a trellis layout. + BlockAndTailDupResult getBestTrellisSuccessor( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + BranchProbability AdjustedSumProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + /// Get the best pair of non-conflicting edges. + static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges( + const MachineBasicBlock *BB, + MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges); + /// Returns true if a block can tail duplicate into all unplaced + /// predecessors. Filters based on loop. + bool canTailDuplicateUnplacedPreds( + const MachineBasicBlock *BB, MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Find chains of triangles to tail-duplicate where a global analysis works, + /// but a local analysis would not find them. 
+ void precomputeTriangleChains(); public: static char ID; // Pass identification, replacement for typeid @@ -415,7 +488,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineBlockFrequencyInfo>(); - AU.addRequired<MachineDominatorTree>(); + if (TailDupPlacement) + AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); AU.addRequired<TargetPassConfig>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -425,20 +499,20 @@ public: char MachineBlockPlacement::ID = 0; char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID; -INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) #ifndef NDEBUG /// \brief Helper to print the name of a MBB. /// /// Only used by debug logging. -static std::string getBlockName(MachineBasicBlock *BB) { +static std::string getBlockName(const MachineBasicBlock *BB) { std::string Result; raw_string_ostream OS(Result); OS << "BB#" << BB->getNumber(); @@ -455,7 +529,7 @@ static std::string getBlockName(MachineBasicBlock *BB) { /// having one fewer active predecessor. It also adds any successors of this /// chain which reach the zero-predecessor state to the appropriate worklist. void MachineBlockPlacement::markChainSuccessors( - BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, + const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Walk all the blocks in this chain, marking their successors as having // a predecessor placed. @@ -471,8 +545,8 @@ void MachineBlockPlacement::markChainSuccessors( /// and was duplicated into the chain end, we need to redo markBlockSuccessors /// for just that block. void MachineBlockPlacement::markBlockSuccessors( - BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter) { + const BlockChain &Chain, const MachineBasicBlock *MBB, + const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Add any successors for which this is the only un-placed in-loop // predecessor to the worklist as a viable candidate for CFG-neutral // placement. No subsequent placement of this block will violate the CFG @@ -504,7 +578,8 @@ void MachineBlockPlacement::markBlockSuccessors( /// the total branch probability of edges from \p BB to those /// blocks. BranchProbability MachineBlockPlacement::collectViableSuccessors( - MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, SmallVector<MachineBasicBlock *, 4> &Successors) { // Adjust edge probabilities by excluding edges pointing to blocks that is // either not in BlockFilter or is already in the current chain. 
Consider the @@ -519,8 +594,8 @@ BranchProbability MachineBlockPlacement::collectViableSuccessors( // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after // A->C is chosen as a fall-through, D won't be selected as a successor of C // due to CFG constraint (the probability of C->D is not greater than - // HotProb to break top-order). If we exclude E that is not in BlockFilter - // when calculating the probability of C->D, D will be selected and we + // HotProb to break topo-order). If we exclude E that is not in BlockFilter + // when calculating the probability of C->D, D will be selected and we // will get A C D B as the layout of this loop. auto AdjustedSumProb = BranchProbability::getOne(); for (MachineBasicBlock *Succ : BB->successors()) { @@ -561,46 +636,573 @@ getAdjustedProbability(BranchProbability OrigProb, return SuccProb; } -/// When the option OutlineOptionalBranches is on, this method -/// checks if the fallthrough candidate block \p Succ (of block -/// \p BB) also has other unscheduled predecessor blocks which -/// are also successors of \p BB (forming triangular shape CFG). -/// If none of such predecessors are small, it returns true. -/// The caller can choose to select \p Succ as the layout successors -/// so that \p Succ's predecessors (optional branches) can be -/// outlined. -/// FIXME: fold this with more general layout cost analysis. -bool MachineBlockPlacement::shouldPredBlockBeOutlined( - MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, - const BlockFilterSet *BlockFilter, BranchProbability SuccProb, - BranchProbability HotProb) { - if (!OutlineOptionalBranches) +/// Check if \p BB has exactly the successors in \p Successors. +static bool +hasSameSuccessors(MachineBasicBlock &BB, + SmallPtrSetImpl<const MachineBasicBlock *> &Successors) { + if (BB.succ_size() != Successors.size()) + return false; + // We don't want to count self-loops + if (Successors.count(&BB)) return false; - // If we outline optional branches, look whether Succ is unavoidable, i.e. - // dominates all terminators of the MachineFunction. If it does, other - // successors must be optional. Don't do this for cold branches. - if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { - for (MachineBasicBlock *Pred : Succ->predecessors()) { - // Check whether there is an unplaced optional branch. - if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) + for (MachineBasicBlock *Succ : BB.successors()) + if (!Successors.count(Succ)) + return false; + return true; +} + +/// Check if a block should be tail duplicated to increase fallthrough +/// opportunities. +/// \p BB Block to check. +bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + bool IsSimple = TailDup.isSimpleBB(BB); + + if (BB->succ_size() == 1) + return false; + return TailDup.shouldTailDuplicate(IsSimple, *BB); +} + +/// Compare 2 BlockFrequency's with a small penalty for \p A. +/// In order to be conservative, we apply a X% penalty to account for +/// increased icache pressure and static heuristics. For small frequencies +/// we use only the numerators to improve accuracy. For simplicity, we assume the +/// penalty is less than 100% +/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. 
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B, + uint64_t EntryFreq) { + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + BlockFrequency Gain = A - B; + return (Gain / ThresholdProb).getFrequency() >= EntryFreq; +} + +/// Check the edge frequencies to see if tail duplication will increase +/// fallthroughs. It only makes sense to call this function when +/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is +/// always locally profitable if we would have picked \p Succ without +/// considering duplication. +bool MachineBlockPlacement::isProfitableToTailDup( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + BranchProbability QProb, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + // We need to do a probability calculation to make sure this is profitable. + // First: does succ have a successor that post-dominates? This affects the + // calculation. The 2 relevant cases are: + // BB BB + // | \Qout | \Qout + // P| C |P C + // = C' = C' + // | /Qin | /Qin + // | / | / + // Succ Succ + // / \ | \ V + // U/ =V |U \ + // / \ = D + // D E | / + // | / + // |/ + // PDom + // '=' : Branch taken for that CFG edge + // In the second case, Placing Succ while duplicating it into C prevents the + // fallthrough of Succ into either D or PDom, because they now have C as an + // unplaced predecessor + + // Start by figuring out which case we fall into + MachineBasicBlock *PDom = nullptr; + SmallVector<MachineBasicBlock *, 4> SuccSuccs; + // Only scan the relevant successors + auto AdjustedSuccSumProb = + collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs); + BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ); + auto BBFreq = MBFI->getBlockFreq(BB); + auto SuccFreq = MBFI->getBlockFreq(Succ); + BlockFrequency P = BBFreq * PProb; + BlockFrequency Qout = BBFreq * QProb; + uint64_t EntryFreq = MBFI->getEntryFreq(); + // If there are no more successors, it is profitable to copy, as it strictly + // increases fallthrough. + if (SuccSuccs.size() == 0) + return greaterWithBias(P, Qout, EntryFreq); + + auto BestSuccSucc = BranchProbability::getZero(); + // Find the PDom or the best Succ if no PDom exists. + for (MachineBasicBlock *SuccSucc : SuccSuccs) { + auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); + if (Prob > BestSuccSucc) + BestSuccSucc = Prob; + if (PDom == nullptr) + if (MPDT->dominates(SuccSucc, Succ)) { + PDom = SuccSucc; + break; + } + } + // For the comparisons, we need to know Succ's best incoming edge that isn't + // from BB. + auto SuccBestPred = BlockFrequency(0); + for (MachineBasicBlock *SuccPred : Succ->predecessors()) { + if (SuccPred == Succ || SuccPred == BB + || BlockToChain[SuccPred] == &Chain + || (BlockFilter && !BlockFilter->count(SuccPred))) + continue; + auto Freq = MBFI->getBlockFreq(SuccPred) + * MBPI->getEdgeProbability(SuccPred, Succ); + if (Freq > SuccBestPred) + SuccBestPred = Freq; + } + // Qin is Succ's best unplaced incoming edge that isn't BB + BlockFrequency Qin = SuccBestPred; + // If it doesn't have a post-dominating successor, here is the calculation: + // BB BB + // | \Qout | \ + // P| C | = + // = C' | C + // | /Qin | | + // | / | C' (+Succ) + // Succ Succ /| + // / \ | \/ | + // U/ =V | == | + // / \ | / \| + // D E D E + // '=' : Branch taken for that CFG edge + // Cost in the first case is: P + V + // For this calculation, we always assume P > Qout. If Qout > P + // The result of this function will be ignored at the caller. 
+ // Let F = SuccFreq - Qin + // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V + + if (PDom == nullptr || !Succ->isSuccessor(PDom)) { + BranchProbability UProb = BestSuccSucc; + BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency F = SuccFreq - Qin; + BlockFrequency V = SuccFreq * VProb; + BlockFrequency QinU = std::min(Qin, F) * UProb; + BlockFrequency BaseCost = P + V; + BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb; + return greaterWithBias(BaseCost, DupCost, EntryFreq); + } + BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom); + BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency U = SuccFreq * UProb; + BlockFrequency V = SuccFreq * VProb; + BlockFrequency F = SuccFreq - Qin; + // If there is a post-dominating successor, here is the calculation: + // BB BB BB BB + // | \Qout | \ | \Qout | \ + // |P C | = |P C | = + // = C' |P C = C' |P C + // | /Qin | | | /Qin | | + // | / | C' (+Succ) | / | C' (+Succ) + // Succ Succ /| Succ Succ /| + // | \ V | \/ | | \ V | \/ | + // |U \ |U /\ =? |U = |U /\ | + // = D = = =?| | D | = =| + // | / |/ D | / |/ D + // | / | / | = | / + // |/ | / |/ | = + // Dom Dom Dom Dom + // '=' : Branch taken for that CFG edge + // The cost for taken branches in the first case is P + U + // Let F = SuccFreq - Qin + // The cost in the second case (assuming independence), given the layout: + // BB, Succ, (C+Succ), D, Dom or the layout: + // BB, Succ, D, Dom, (C+Succ) + // is Qout + max(F, Qin) * U + min(F, Qin) + // compare P + U vs Qout + P * U + Qin. + // + // The 3rd and 4th cases cover when Dom would be chosen to follow Succ. + // + // For the 3rd case, the cost is P + 2 * V + // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V + // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V + if (UProb > AdjustedSuccSumProb / 2 && + !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb, + Chain, BlockFilter)) + // Cases 3 & 4 + return greaterWithBias( + (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb), + EntryFreq); + // Cases 1 & 2 + return greaterWithBias((P + U), + (Qout + std::min(Qin, F) * AdjustedSuccSumProb + + std::max(Qin, F) * UProb), + EntryFreq); +} + +/// Check for a trellis layout. \p BB is the upper part of a trellis if its +/// successors form the lower part of a trellis. A successor set S forms the +/// lower part of a trellis if all of the predecessors of S are either in S or +/// have all of S as successors. We ignore trellises where BB doesn't have 2 +/// successors because for fewer than 2, it's trivial, and for 3 or greater they +/// are very uncommon and complex to compute optimally. Allowing edges within S +/// is not strictly a trellis, but the same algorithm works, so we allow it. +bool MachineBlockPlacement::isTrellis( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + // Technically BB could form a trellis with branching factor higher than 2. + // But that's extremely uncommon. + if (BB->succ_size() != 2 || ViableSuccs.size() != 2) + return false; + + SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(), + BB->succ_end()); + // To avoid reviewing the same predecessors twice. 
+ SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds; + + for (MachineBasicBlock *Succ : ViableSuccs) { + int PredCount = 0; + for (auto SuccPred : Succ->predecessors()) { + // Allow triangle successors, but don't count them. + if (Successors.count(SuccPred)) { + // Make sure that it is actually a triangle. + for (MachineBasicBlock *CheckSucc : SuccPred->successors()) + if (!Successors.count(CheckSucc)) + return false; + continue; + } + const BlockChain *PredChain = BlockToChain[SuccPred]; + if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) || + PredChain == &Chain || PredChain == BlockToChain[Succ]) continue; - // Check whether the optional branch has exactly one BB. - if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) + ++PredCount; + // Perform the successor check only once. + if (!SeenPreds.insert(SuccPred).second) continue; - // Check whether the optional branch is small. - if (Pred->size() < OutlineOptionalThreshold) + if (!hasSameSuccessors(*SuccPred, Successors)) return false; } - return true; - } else + // If one of the successors has only BB as a predecessor, it is not a + // trellis. + if (PredCount < 1) + return false; + } + return true; +} + +/// Pick the highest total weight pair of edges that can both be laid out. +/// The edges in \p Edges[0] are assumed to have a different destination than +/// the edges in \p Edges[1]. Simple counting shows that the best pair is either +/// the individual highest weight edges to the 2 different destinations, or in +/// case of a conflict, one of them should be replaced with a 2nd best edge. +std::pair<MachineBlockPlacement::WeightedEdge, + MachineBlockPlacement::WeightedEdge> +MachineBlockPlacement::getBestNonConflictingEdges( + const MachineBasicBlock *BB, + MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>> + Edges) { + // Sort the edges, and then for each successor, find the best incoming + // predecessor. If the best incoming predecessors aren't the same, + // then that is clearly the best layout. If there is a conflict, one of the + // successors will have to fallthrough from the second best predecessor. We + // compare which combination is better overall. + + // Sort for highest frequency. + auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; }; + + std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp); + std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp); + auto BestA = Edges[0].begin(); + auto BestB = Edges[1].begin(); + // Arrange for the correct answer to be in BestA and BestB + // If the 2 best edges don't conflict, the answer is already there. + if (BestA->Src == BestB->Src) { + // Compare the total fallthrough of (Best + Second Best) for both pairs + auto SecondBestA = std::next(BestA); + auto SecondBestB = std::next(BestB); + BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight; + BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight; + if (BestAScore < BestBScore) + BestA = SecondBestA; + else + BestB = SecondBestB; + } + // Arrange for the BB edge to be in BestA if it exists. + if (BestB->Src == BB) + std::swap(BestA, BestB); + return std::make_pair(*BestA, *BestB); +} + +/// Get the best successor from \p BB based on \p BB being part of a trellis. +/// We only handle trellises with 2 successors, so the algorithm is +/// straightforward: Find the best pair of edges that don't conflict. We find +/// the best incoming edge for each successor in the trellis. 
If those conflict, +/// we consider which of them should be replaced with the second best. +/// Upon return the two best edges will be in \p BestEdges. If one of the edges +/// comes from \p BB, it will be in \p BestEdges[0] +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::getBestTrellisSuccessor( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + BranchProbability AdjustedSumProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + + BlockAndTailDupResult Result = {nullptr, false}; + SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), + BB->succ_end()); + + // We assume size 2 because it's common. For general n, we would have to do + // the Hungarian algorithm, but it's not worth the complexity because more + // than 2 successors is fairly uncommon, and a trellis even more so. + if (Successors.size() != 2 || ViableSuccs.size() != 2) + return Result; + + // Collect the edge frequencies of all edges that form the trellis. + SmallVector<WeightedEdge, 8> Edges[2]; + int SuccIndex = 0; + for (auto Succ : ViableSuccs) { + for (MachineBasicBlock *SuccPred : Succ->predecessors()) { + // Skip any placed predecessors that are not BB + if (SuccPred != BB) + if ((BlockFilter && !BlockFilter->count(SuccPred)) || + BlockToChain[SuccPred] == &Chain || + BlockToChain[SuccPred] == BlockToChain[Succ]) + continue; + BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) * + MBPI->getEdgeProbability(SuccPred, Succ); + Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ}); + } + ++SuccIndex; + } + + // Pick the best combination of 2 edges from all the edges in the trellis. + WeightedEdge BestA, BestB; + std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); + + if (BestA.Src != BB) { + // If we have a trellis, and BB doesn't have the best fallthrough edges, + // we shouldn't choose any successor. We've already looked and there's a + // better fallthrough edge for all the successors. + DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n"); + return Result; + } + + // Did we pick the triangle edge? If tail-duplication is profitable, do + // that instead. Otherwise merge the triangle edge now while we know it is + // optimal. + if (BestA.Dest == BestB.Src) { + // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2 + // would be better. + MachineBasicBlock *Succ1 = BestA.Dest; + MachineBasicBlock *Succ2 = BestB.Dest; + // Check to see if tail-duplication would be profitable. + if (TailDupPlacement && shouldTailDuplicate(Succ2) && + canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) && + isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1), + Chain, BlockFilter)) { + DEBUG(BranchProbability Succ2Prob = getAdjustedProbability( + MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb); + dbgs() << " Selected: " << getBlockName(Succ2) + << ", probability: " << Succ2Prob << " (Tail Duplicate)\n"); + Result.BB = Succ2; + Result.ShouldTailDup = true; + return Result; + } + } + // We have already computed the optimal edge for the other side of the + // trellis. 
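+  // For example, if BestA is BB->Succ1 and BestB is Pred->Succ2 for some
+  // other unplaced predecessor Pred, recording Pred->Succ2 here lets
+  // selectBestSuccessor() reuse (and then erase) that edge when Pred is laid
+  // out, instead of repeating the trellis analysis from Pred's side.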
+ ComputedEdges[BestB.Src] = { BestB.Dest, false }; + + auto TrellisSucc = BestA.Dest; + DEBUG(BranchProbability SuccProb = getAdjustedProbability( + MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb); + dbgs() << " Selected: " << getBlockName(TrellisSucc) + << ", probability: " << SuccProb << " (Trellis)\n"); + Result.BB = TrellisSucc; + return Result; +} + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. +bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( + const MachineBasicBlock *BB, MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + if (!shouldTailDuplicate(Succ)) return false; + + // For CFG checking. + SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), + BB->succ_end()); + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors can be + // tail-duplicated into. + // Skip any blocks that are already placed or not in this loop. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) { + if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) + // This will result in a trellis after tail duplication, so we don't + // need to copy Succ into this predecessor. In the presence + // of a trellis tail duplication can continue to be profitable. + // For example: + // A A + // |\ |\ + // | \ | \ + // | C | C+BB + // | / | | + // |/ | | + // BB => BB | + // |\ |\/| + // | \ |/\| + // | D | D + // | / | / + // |/ |/ + // Succ Succ + // + // After BB was duplicated into C, the layout looks like the one on the + // right. BB and C now have the same successors. When considering + // whether Succ can be duplicated into all its unplaced predecessors, we + // ignore C. + // We can do this because C already has a profitable fallthrough, namely + // D. TODO(iteratee): ignore sufficiently cold predecessors for + // duplication and for this test. + // + // This allows trellises to be laid out in 2 separate chains + // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic + // because it allows the creation of 2 fallthrough paths with links + // between them, and we correctly identify the best layout for these + // CFGs. We want to extend trellises that the user created in addition + // to trellises created by tail-duplication, so we just look for the + // CFG. + continue; + return false; + } + } + return true; +} + +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +/// longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +/// U-shaped distribution. +/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +/// If the branches in a chain are likely to be from the same side of the +/// distribution as their predecessor, but are independent at runtime, this +/// transformation is profitable. (Because the cost of being wrong is a small +/// fixed cost, unlike the standard triangle layout where the cost of being +/// wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. 
If the probability that a previous +/// branch was taken positively influences whether the next branch will be +/// taken +/// We believe that 2 and 3 are common enough to justify the small margin in 1. +void MachineBlockPlacement::precomputeTriangleChains() { + struct TriangleChain { + std::vector<MachineBasicBlock *> Edges; + TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst) + : Edges({src, dst}) {} + + void append(MachineBasicBlock *dst) { + assert(getKey()->isSuccessor(dst) && + "Attempting to append a block that is not a successor."); + Edges.push_back(dst); + } + + unsigned count() const { return Edges.size() - 1; } + + MachineBasicBlock *getKey() const { + return Edges.back(); + } + }; + + if (TriangleChainCount == 0) + return; + + DEBUG(dbgs() << "Pre-computing triangle chains.\n"); + // Map from last block to the chain that contains it. This allows us to extend + // chains as we find new triangles. + DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap; + for (MachineBasicBlock &BB : *F) { + // If BB doesn't have 2 successors, it doesn't start a triangle. + if (BB.succ_size() != 2) + continue; + MachineBasicBlock *PDom = nullptr; + for (MachineBasicBlock *Succ : BB.successors()) { + if (!MPDT->dominates(Succ, &BB)) + continue; + PDom = Succ; + break; + } + // If BB doesn't have a post-dominating successor, it doesn't form a + // triangle. + if (PDom == nullptr) + continue; + // If PDom has a hint that it is low probability, skip this triangle. + if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) + continue; + // If PDom isn't eligible for duplication, this isn't the kind of triangle + // we're looking for. + if (!shouldTailDuplicate(PDom)) + continue; + bool CanTailDuplicate = true; + // If PDom can't tail-duplicate into it's non-BB predecessors, then this + // isn't the kind of triangle we're looking for. + for (MachineBasicBlock* Pred : PDom->predecessors()) { + if (Pred == &BB) + continue; + if (!TailDup.canTailDuplicate(PDom, Pred)) { + CanTailDuplicate = false; + break; + } + } + // If we can't tail-duplicate PDom to its predecessors, then skip this + // triangle. + if (!CanTailDuplicate) + continue; + + // Now we have an interesting triangle. Insert it if it's not part of an + // existing chain. + // Note: This cannot be replaced with a call insert() or emplace() because + // the find key is BB, but the insert/emplace key is PDom. + auto Found = TriangleChainMap.find(&BB); + // If it is, remove the chain from the map, grow it, and put it back in the + // map with the end as the new key. + if (Found != TriangleChainMap.end()) { + TriangleChain Chain = std::move(Found->second); + TriangleChainMap.erase(Found); + Chain.append(PDom); + TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); + } else { + auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); + assert(InsertResult.second && "Block seen twice."); + (void)InsertResult; + } + } + + // Iterating over a DenseMap is safe here, because the only thing in the body + // of the loop is inserting into another DenseMap (ComputedEdges). + // ComputedEdges is never iterated, so this doesn't lead to non-determinism. + for (auto &ChainPair : TriangleChainMap) { + TriangleChain &Chain = ChainPair.second; + // Benchmarking has shown that due to branch correlation duplicating 2 or + // more triangles is profitable, despite the calculations assuming + // independence. 
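+    // For example, two stacked triangles A->(B) and B->(C), where B
+    // post-dominates A and C post-dominates B, are stored as Edges = {A, B, C}
+    // with count() == 2; if that meets the TriangleChainCount threshold, the
+    // loop below records A->B and B->C as pre-computed edges that should be
+    // tail-duplicated.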
+ if (Chain.count() < TriangleChainCount) + continue; + MachineBasicBlock *dst = Chain.Edges.back(); + Chain.Edges.pop_back(); + for (MachineBasicBlock *src : reverse(Chain.Edges)) { + DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << + getBlockName(dst) << " as pre-computed based on triangles.\n"); + + auto InsertResult = ComputedEdges.insert({src, {dst, true}}); + assert(InsertResult.second && "Block seen twice."); + (void)InsertResult; + + dst = src; + } + } } // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold( - MachineBasicBlock *BB) { + const MachineBasicBlock *BB) { if (!BB->getParent()->getFunction()->getEntryCount()) return BranchProbability(StaticLikelyProb, 100); if (BB->succ_size() == 2) { @@ -609,11 +1211,11 @@ static BranchProbability getLayoutSuccessorProbThreshold( if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { /* See case 1 below for the cost analysis. For BB->Succ to * be taken with smaller cost, the following needs to hold: - * Prob(BB->Succ) > 2* Prob(BB->Pred) - * So the threshold T - * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, - * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified - * branch bias, we have + * Prob(BB->Succ) > 2 * Prob(BB->Pred) + * So the threshold T in the calculation below + * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred) + * So T / (1 - T) = 2, Yielding T = 2/3 + * Also adding user specified branch bias, we have * T = (2/3)*(ProfileLikelyProb/50) * = (2*ProfileLikelyProb)/150) */ @@ -625,10 +1227,17 @@ static BranchProbability getLayoutSuccessorProbThreshold( /// Checks to see if the layout candidate block \p Succ has a better layout /// predecessor than \c BB. If yes, returns true. +/// \p SuccProb: The probability adjusted for only remaining blocks. +/// Only used for logging +/// \p RealSuccProb: The un-adjusted probability. +/// \p Chain: The chain that BB belongs to and Succ is being considered for. +/// \p BlockFilter: if non-null, the set of blocks that make up the loop being +/// considered bool MachineBlockPlacement::hasBetterLayoutPredecessor( - MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, - BranchProbability SuccProb, BranchProbability RealSuccProb, - BlockChain &Chain, const BlockFilterSet *BlockFilter) { + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { // There isn't a better layout when there are no unscheduled predecessors. if (SuccChain.UnscheduledPredecessors == 0) @@ -689,9 +1298,9 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( // | | | | // ---BB | | BB // | | | | - // | pred-- | Succ-- + // | Pred-- | Succ-- // | | | | - // ---succ ---pred-- + // ---Succ ---Pred-- // // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred) // = freq(S->Pred) + freq(S->BB) @@ -734,11 +1343,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( // | Pred----| | S1---- // | | | | // --(S1 or S2) ---Pred-- + // | + // S2 // // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2) // + min(freq(Pred->S1), freq(Pred->S2)) // Non-topo-order cost: - // In the worst case, S2 will not get laid out after Pred. // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2). 
// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2)) // is 0. Then the non topo layout is better when @@ -756,13 +1366,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( for (MachineBasicBlock *Pred : Succ->predecessors()) { if (Pred == Succ || BlockToChain[Pred] == &SuccChain || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) + BlockToChain[Pred] == &Chain || + // This check is redundant except for look ahead. This function is + // called for lookahead by isProfitableToTailDup when BB hasn't been + // placed yet. + (Pred == BB)) continue; // Do backward checking. // For all cases above, we need a backward checking to filter out edges that - // are not 'strongly' biased. With profile data available, the check is - // mostly redundant for case 2 (when threshold prob is set at 50%) unless S - // has more than two successors. + // are not 'strongly' biased. // BB Pred // \ / // Succ @@ -798,14 +1410,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( /// breaking CFG structure, but cave and break such structures in the case of /// very hot successor edges. /// -/// \returns The best successor block found, or null if none are viable. -MachineBasicBlock * -MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter) { +/// \returns The best successor block found, or null if none are viable, along +/// with a boolean indicating if tail duplication is necessary. +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::selectBestSuccessor( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(StaticLikelyProb, 100); - MachineBasicBlock *BestSucc = nullptr; + BlockAndTailDupResult BestSucc = { nullptr, false }; auto BestProb = BranchProbability::getZero(); SmallVector<MachineBasicBlock *, 4> Successors; @@ -813,22 +1426,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, collectViableSuccessors(BB, Chain, BlockFilter, Successors); DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n"); + + // if we already precomputed the best successor for BB, return that if still + // applicable. + auto FoundEdge = ComputedEdges.find(BB); + if (FoundEdge != ComputedEdges.end()) { + MachineBasicBlock *Succ = FoundEdge->second.BB; + ComputedEdges.erase(FoundEdge); + BlockChain *SuccChain = BlockToChain[Succ]; + if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) && + SuccChain != &Chain && Succ == *SuccChain->begin()) + return FoundEdge->second; + } + + // if BB is part of a trellis, Use the trellis to determine the optimal + // fallthrough edges + if (isTrellis(BB, Successors, Chain, BlockFilter)) + return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain, + BlockFilter); + + // For blocks with CFG violations, we may be able to lay them out anyway with + // tail-duplication. We keep this vector so we can perform the probability + // calculations the minimum number of times. + SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4> + DupCandidates; for (MachineBasicBlock *Succ : Successors) { auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb); - // This heuristic is off by default. 
- if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, - HotProb)) - return Succ; - BlockChain &SuccChain = *BlockToChain[Succ]; // Skip the edge \c BB->Succ if block \c Succ has a better layout // predecessor that yields lower global cost. if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, - Chain, BlockFilter)) + Chain, BlockFilter)) { + // If tail duplication would make Succ profitable, place it. + if (TailDupPlacement && shouldTailDuplicate(Succ)) + DupCandidates.push_back(std::make_tuple(SuccProb, Succ)); continue; + } DEBUG( dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " @@ -836,17 +1472,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestProb >= SuccProb) { + if (BestSucc.BB && BestProb >= SuccProb) { DEBUG(dbgs() << " Not the best candidate, continuing\n"); continue; } DEBUG(dbgs() << " Setting it as best candidate\n"); - BestSucc = Succ; + BestSucc.BB = Succ; BestProb = SuccProb; } - if (BestSucc) - DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n"); + // Handle the tail duplication candidates in order of decreasing probability. + // Stop at the first one that is profitable. Also stop if they are less + // profitable than BestSucc. Position is important because we preserve it and + // prefer first best match. Here we aren't comparing in order, so we capture + // the position instead. + if (DupCandidates.size() != 0) { + auto cmp = + [](const std::tuple<BranchProbability, MachineBasicBlock *> &a, + const std::tuple<BranchProbability, MachineBasicBlock *> &b) { + return std::get<0>(a) > std::get<0>(b); + }; + std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp); + } + for(auto &Tup : DupCandidates) { + BranchProbability DupProb; + MachineBasicBlock *Succ; + std::tie(DupProb, Succ) = Tup; + if (DupProb < BestProb) + break; + if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) + && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { + DEBUG( + dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " + << DupProb + << " (Tail Duplicate)\n"); + BestSucc.BB = Succ; + BestSucc.ShouldTailDup = true; + break; + } + } + + if (BestSucc.BB) + DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n"); return BestSucc; } @@ -862,7 +1529,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, /// /// \returns The best block found, or null if none are viable. MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( - BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { + const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { // Once we need to walk the worklist looking for a candidate, cleanup the // worklist of already placed entries. 
// FIXME: If this shows up on profiles, it could be folded (at the cost of @@ -881,13 +1548,15 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( MachineBasicBlock *BestBlock = nullptr; BlockFrequency BestFreq; for (MachineBasicBlock *MBB : WorkList) { - assert(MBB->isEHPad() == IsEHPad); + assert(MBB->isEHPad() == IsEHPad && + "EHPad mismatch between block and work list."); BlockChain &SuccChain = *BlockToChain[MBB]; if (&SuccChain == &Chain) continue; - assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block"); + assert(SuccChain.UnscheduledPredecessors == 0 && + "Found CFG-violating block"); BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB); DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "; @@ -948,16 +1617,19 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( } void MachineBlockPlacement::fillWorkLists( - MachineBasicBlock *MBB, + const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter = nullptr) { BlockChain &Chain = *BlockToChain[MBB]; if (!UpdatedPreds.insert(&Chain).second) return; - assert(Chain.UnscheduledPredecessors == 0); + assert( + Chain.UnscheduledPredecessors == 0 && + "Attempting to place block with unscheduled predecessors in worklist."); for (MachineBasicBlock *ChainBB : Chain) { - assert(BlockToChain[ChainBB] == &Chain); + assert(BlockToChain[ChainBB] == &Chain && + "Block in chain doesn't match BlockToChain map."); for (MachineBasicBlock *Pred : ChainBB->predecessors()) { if (BlockFilter && !BlockFilter->count(Pred)) continue; @@ -970,23 +1642,23 @@ void MachineBlockPlacement::fillWorkLists( if (Chain.UnscheduledPredecessors != 0) return; - MBB = *Chain.begin(); - if (MBB->isEHPad()) - EHPadWorkList.push_back(MBB); + MachineBasicBlock *BB = *Chain.begin(); + if (BB->isEHPad()) + EHPadWorkList.push_back(BB); else - BlockWorkList.push_back(MBB); + BlockWorkList.push_back(BB); } void MachineBlockPlacement::buildChain( - MachineBasicBlock *BB, BlockChain &Chain, + const MachineBasicBlock *HeadBB, BlockChain &Chain, BlockFilterSet *BlockFilter) { - assert(BB && "BB must not be null.\n"); - assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); + assert(HeadBB && "BB must not be null.\n"); + assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); - MachineBasicBlock *LoopHeaderBB = BB; + const MachineBasicBlock *LoopHeaderBB = HeadBB; markChainSuccessors(Chain, LoopHeaderBB, BlockFilter); - BB = *std::prev(Chain.end()); + MachineBasicBlock *BB = *std::prev(Chain.end()); for (;;) { assert(BB && "null block found at end of chain in loop."); assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); @@ -995,7 +1667,11 @@ void MachineBlockPlacement::buildChain( // Look for the best viable successor if there is one to place immediately // after this block. - MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + auto Result = selectBestSuccessor(BB, Chain, BlockFilter); + MachineBasicBlock* BestSucc = Result.BB; + bool ShouldTailDup = Result.ShouldTailDup; + if (TailDupPlacement) + ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc)); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -1016,7 +1692,7 @@ void MachineBlockPlacement::buildChain( // Placement may have changed tail duplication opportunities. // Check for that now. 
- if (TailDupPlacement && BestSucc) { + if (TailDupPlacement && BestSucc && ShouldTailDup) { // If the chosen successor was duplicated into all its predecessors, // don't bother laying it out, just go round the loop again with BB as // the chain end. @@ -1052,7 +1728,7 @@ void MachineBlockPlacement::buildChain( /// unconditional jump (for the backedge) rotating it in front of the loop /// header is always profitable. MachineBasicBlock * -MachineBlockPlacement::findBestLoopTop(MachineLoop &L, +MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // Placing the latch block before the header may introduce an extra branch // that skips this block the first time the loop is executed, which we want @@ -1116,7 +1792,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L, /// block to layout at the top of the loop. Typically this is done to maximize /// fallthrough opportunities. MachineBasicBlock * -MachineBlockPlacement::findBestLoopExit(MachineLoop &L, +MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // We don't want to layout the loop linearly in all cases. If the loop header // is just a normal basic block in the loop, we want to look for what block @@ -1235,12 +1911,18 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L, /// branches. For example, if the loop has fallthrough into its header and out /// of its bottom already, don't rotate it. void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, - MachineBasicBlock *ExitingBB, + const MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet) { if (!ExitingBB) return; MachineBasicBlock *Top = *LoopChain.begin(); + MachineBasicBlock *Bottom = *std::prev(LoopChain.end()); + + // If ExitingBB is already the last one in a chain then nothing to do. + if (Bottom == ExitingBB) + return; + bool ViableTopFallthrough = false; for (MachineBasicBlock *Pred : Top->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; @@ -1255,7 +1937,6 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, // bottom is a viable exiting block. If so, bail out as rotating will // introduce an unnecessary branch. if (ViableTopFallthrough) { - MachineBasicBlock *Bottom = *std::prev(LoopChain.end()); for (MachineBasicBlock *Succ : Bottom->successors()) { BlockChain *SuccChain = BlockToChain[Succ]; if (!LoopBlockSet.count(Succ) && @@ -1268,6 +1949,36 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, if (ExitIt == LoopChain.end()) return; + // Rotating a loop exit to the bottom when there is a fallthrough to top + // trades the entry fallthrough for an exit fallthrough. + // If there is no bottom->top edge, but the chosen exit block does have + // a fallthrough, we break that fallthrough for nothing in return. + + // Let's consider an example. We have a built chain of basic blocks + // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block. + // By doing a rotation we get + // Bk+1, ..., Bn, B1, ..., Bk + // Break of fallthrough to B1 is compensated by a fallthrough from Bk. + // If we had a fallthrough Bk -> Bk+1 it is broken now. + // It might be compensated by fallthrough Bn -> B1. + // So we have a condition to avoid creation of extra branch by loop rotation. + // All below must be true to avoid loop rotation: + // If there is a fallthrough to top (B1) + // There was fallthrough from chosen exit block (Bk) to next one (Bk+1) + // There is no fallthrough from bottom (Bn) to top (B1). 
+ // Please note that there is no exit fallthrough from Bn because we checked it + // above. + if (ViableTopFallthrough) { + assert(std::next(ExitIt) != LoopChain.end() && + "Exit should not be last BB"); + MachineBasicBlock *NextBlockInChain = *std::next(ExitIt); + if (ExitingBB->isSuccessor(NextBlockInChain)) + if (!Bottom->isSuccessor(Top)) + return; + } + + DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB) + << " at bottom\n"); std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } @@ -1285,7 +1996,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, /// Therefore, the cost for a given rotation is the sum of costs listed above. /// We select the best rotation with the smallest cost. void MachineBlockPlacement::rotateLoopWithProfile( - BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet) { auto HeaderBB = L.getHeader(); auto HeaderIter = find(LoopChain, HeaderBB); auto RotationPos = LoopChain.end(); @@ -1422,7 +2134,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( /// When profile data is available, exclude cold blocks from the returned set; /// otherwise, collect all blocks in the loop. MachineBlockPlacement::BlockFilterSet -MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { +MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { BlockFilterSet LoopBlockSet; // Filter cold blocks off from LoopBlockSet when profile data is available. @@ -1459,14 +2171,16 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { /// as much as possible. We can then stitch the chains together in a way which /// both preserves the topological structure and minimizes taken conditional /// branches. -void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { +void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { // First recurse through any nested loops, building chains for those inner // loops. - for (MachineLoop *InnerLoop : L) + for (const MachineLoop *InnerLoop : L) buildLoopChains(*InnerLoop); - assert(BlockWorkList.empty()); - assert(EHPadWorkList.empty()); + assert(BlockWorkList.empty() && + "BlockWorkList not empty when starting to build loop chains."); + assert(EHPadWorkList.empty() && + "EHPadWorkList not empty when starting to build loop chains."); BlockFilterSet LoopBlockSet = collectLoopBlockSet(L); // Check if we have profile data for this function. If yes, we will rotate @@ -1496,10 +2210,11 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { // walk the blocks, and use a set to prevent visiting a particular chain // twice. 
SmallPtrSet<BlockChain *, 4> UpdatedPreds; - assert(LoopChain.UnscheduledPredecessors == 0); + assert(LoopChain.UnscheduledPredecessors == 0 && + "LoopChain should not have unscheduled predecessors."); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : LoopBlockSet) + for (const MachineBasicBlock *LoopBB : LoopBlockSet) fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet); buildChain(LoopTop, LoopChain, &LoopBlockSet); @@ -1533,7 +2248,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { if (!LoopBlockSet.empty()) { BadLoop = true; - for (MachineBasicBlock *LoopBB : LoopBlockSet) + for (const MachineBasicBlock *LoopBB : LoopBlockSet) dbgs() << "Loop contains blocks never placed into a chain!\n" << " Loop header: " << getBlockName(*L.block_begin()) << "\n" << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" @@ -1546,31 +2261,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { EHPadWorkList.clear(); } -/// When OutlineOpitonalBranches is on, this method collects BBs that -/// dominates all terminator blocks of the function \p F. -void MachineBlockPlacement::collectMustExecuteBBs() { - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. - MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : *F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. - UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : *F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } -} - void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. @@ -1605,16 +2295,15 @@ void MachineBlockPlacement::buildCFGChains() { } } - // Turned on with OutlineOptionalBranches option - collectMustExecuteBBs(); - // Build any loop-based chains. PreferredLoopExit = nullptr; for (MachineLoop *L : *MLI) buildLoopChains(*L); - assert(BlockWorkList.empty()); - assert(EHPadWorkList.empty()); + assert(BlockWorkList.empty() && + "BlockWorkList should be empty before building final chain."); + assert(EHPadWorkList.empty() && + "EHPadWorkList should be empty before building final chain."); SmallPtrSet<BlockChain *, 4> UpdatedPreds; for (MachineBasicBlock &MBB : *F) @@ -1839,7 +2528,7 @@ void MachineBlockPlacement::alignBlocks() { /// @return true if \p BB was removed. bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, - MachineBasicBlock *LoopHeaderBB, + const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt) { bool Removed, DuplicatedToLPred; @@ -1901,21 +2590,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( /// \return - True if the block was duplicated into all preds and removed. 
bool MachineBlockPlacement::maybeTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *LPred, - const BlockChain &Chain, BlockFilterSet *BlockFilter, + BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt, bool &DuplicatedToLPred) { - DuplicatedToLPred = false; + if (!shouldTailDuplicate(BB)) + return false; + DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber() << "\n"); - bool IsSimple = TailDup.isSimpleBB(BB); - // Blocks with single successors don't create additional fallthrough - // opportunities. Don't duplicate them. TODO: When conditional exits are - // analyzable, allow them to be duplicated. - if (!IsSimple && BB->succ_size() == 1) - return false; - if (!TailDup.shouldTailDuplicate(IsSimple, *BB)) - return false; + // This has to be a callback because none of it can be done after // BB is deleted. bool Removed = false; @@ -1967,6 +2651,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback); SmallVector<MachineBasicBlock *, 8> DuplicatedPreds; + bool IsSimple = TailDup.isSimpleBB(BB); TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, &RemovalCallbackRef); @@ -2006,25 +2691,46 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis<MachineLoopInfo>(); TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); - MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = nullptr; // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. PreferredLoopExit = nullptr; + assert(BlockToChain.empty() && + "BlockToChain map should be empty before starting placement."); + assert(ComputedEdges.empty() && + "Computed Edge map should be empty before starting placement."); + + unsigned TailDupSize = TailDupPlacementThreshold; + // If only the aggressive threshold is explicitly set, use it. + if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && + TailDupPlacementThreshold.getNumOccurrences() == 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + + TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); + // For agressive optimization, we can adjust some thresholds to be less + // conservative. + if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) { + // At O3 we should be more willing to copy blocks for tail duplication. This + // increases size pressure, so we only do it at O3 + // Do this unless only the regular threshold is explicitly set. + if (TailDupPlacementThreshold.getNumOccurrences() == 0 || + TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + } + if (TailDupPlacement) { - unsigned TailDupSize = TailDuplicatePlacementThreshold; + MPDT = &getAnalysis<MachinePostDominatorTree>(); if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + precomputeTriangleChains(); } - assert(BlockToChain.empty()); - buildCFGChains(); // Changing the layout can create new tail merging opportunities. - TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structured CFG. 
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && @@ -2032,7 +2738,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { BranchFoldPlacement; // No tail merging opportunities if the block number is less than four. if (MF.size() > 3 && EnableTailMerge) { - unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1; + unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); @@ -2041,8 +2747,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); - // Must redo the dominator tree if blocks were changed. - MDT->runOnMachineFunction(MF); + ComputedEdges.clear(); + // Must redo the post-dominator tree if blocks were changed. + if (MPDT) + MPDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } @@ -2052,6 +2760,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { alignBlocks(); BlockToChain.clear(); + ComputedEdges.clear(); ChainAllocator.DestroyAll(); if (AlignAllBlock) @@ -2067,6 +2776,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MBI->setAlignment(AlignAllNonFallThruBlocks); } } + if (ViewBlockLayoutWithBFI != GVDT_None && + (ViewBlockFreqFuncName.empty() || + F->getFunction()->getName().equals(ViewBlockFreqFuncName))) { + MBFI->view("MBP." + MF.getName(), false); + } + // We always return true as we have no way to track whether the final order // differs from the original order. diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp index 0766f46..582ff13 100644 --- a/contrib/llvm/lib/CodeGen/MachineCSE.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SmallSet.h" @@ -22,6 +21,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Support/raw_ostream.h" @@ -108,12 +108,12 @@ namespace { char MachineCSE::ID = 0; char &llvm::MachineCSEID = MachineCSE::ID; -INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse", - "Machine Common Subexpression Elimination", false, false) +INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, + "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineCSE, "machine-cse", - "Machine Common Subexpression Elimination", false, false) +INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, + "Machine Common Subexpression Elimination", false, false) /// The source register of a COPY machine instruction can be propagated to all /// its users, and this propagation could increase the probability of finding @@ -180,8 +180,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg, I = skipDebugInstructionsForward(I, E); if (I == E) - // Reached end of block, register is obviously dead. - return true; + // Reached end of block, we don't know if register is dead or not. 
+ return false; bool SeenDef = false; for (const MachineOperand &MO : I->operands()) { diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp index 5beed5f..e6f80db 100644 --- a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp @@ -8,11 +8,9 @@ //===----------------------------------------------------------------------===// // // The machine combiner pass uses machine trace metrics to ensure the combined -// instructions does not lengthen the critical path or the resource depth. +// instructions do not lengthen the critical path or the resource depth. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "machine-combiner" - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" @@ -32,6 +30,8 @@ using namespace llvm; +#define DEBUG_TYPE "machine-combiner" + STATISTIC(NumInstCombined, "Number of machineinst combined"); namespace { @@ -86,11 +86,11 @@ private: char MachineCombiner::ID = 0; char &llvm::MachineCombinerID = MachineCombiner::ID; -INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", +INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", +INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { @@ -135,7 +135,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth for (auto *InstrPtr : InsInstrs) { // for each Use unsigned IDepth = 0; - DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";); + DEBUG(dbgs() << "NEW INSTR "; + InstrPtr->print(dbgs(), TII); + dbgs() << "\n";); for (const MachineOperand &MO : InstrPtr->operands()) { // Check for virtual register operand. if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) @@ -352,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { return false; } +static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, + SmallVector<MachineInstr *, 16> InsInstrs, + SmallVector<MachineInstr *, 16> DelInstrs, + MachineTraceMetrics *Traces) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + ++NumInstCombined; + Traces->invalidate(MBB); + Traces->verifyAnalysis(); +} + /// Substitute a slow code sequence with a faster one by /// evaluating instruction combining pattern. /// The prototype of such a pattern is MUl + ADD -> MADD. 
Performs instruction @@ -406,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { DenseMap<unsigned, unsigned> InstrIdxForVirtReg; if (!MinInstr) MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); Traces->verifyAnalysis(); TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, InstrIdxForVirtReg); @@ -426,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) || - (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - DelInstrs, InstrIdxForVirtReg, P) && - preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { - for (auto *InstrPtr : InsInstrs) - MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - - Changed = true; - ++NumInstCombined; - - Traces->invalidate(MBB); - Traces->verifyAnalysis(); + if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); // Eagerly stop after the first pattern fires. + Changed = true; break; } else { + // Calculating the trace metrics may be expensive, + // so only do this when necessary. + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, + InstrIdxForVirtReg, P) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + // Eagerly stop after the first pattern fires. + Changed = true; + break; + } // Cleanup instructions of the alternative code sequence. There is no // use for them. 
MachineFunction *MF = MBB->getParent(); diff --git a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 5de6dec..7d5a681 100644 --- a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -19,6 +18,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -27,7 +27,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "codegen-cp" +#define DEBUG_TYPE "machine-cp" STATISTIC(NumDeletes, "Number of dead copies deleted"); @@ -79,7 +79,7 @@ namespace { char MachineCopyPropagation::ID = 0; char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID; -INITIALIZE_PASS(MachineCopyPropagation, "machine-cp", +INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE, "Machine Copy Propagation Pass", false, false) /// Remove any entry in \p Map where the register is a subregister or equal to @@ -291,17 +291,9 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { if (MO.isDef()) { Defs.push_back(Reg); - } else { + continue; + } else if (MO.readsReg()) ReadRegister(Reg); - } - // Treat undef use like defs for copy propagation but not for - // dead copy. We would need to do a liveness check to be sure the copy - // is dead for undef uses. - // The backends are allowed to do whatever they want with undef value - // and we cannot be sure this register will not be rewritten to break - // some false dependencies for the hardware for instance. 
- if (MO.isUndef()) - Defs.push_back(Reg); } // The instruction has a register mask operand which means that it clobbers diff --git a/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp index acb7c48..b559e4e 100644 --- a/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp @@ -12,11 +12,11 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" - using namespace llvm; namespace llvm { -template class DominanceFrontierBase<MachineBasicBlock>; +template class DominanceFrontierBase<MachineBasicBlock, false>; +template class DominanceFrontierBase<MachineBasicBlock, true>; template class ForwardDominanceFrontierBase<MachineBasicBlock>; } diff --git a/contrib/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm/lib/CodeGen/MachineDominators.cpp index 303a6a9..845e823 100644 --- a/contrib/llvm/lib/CodeGen/MachineDominators.cpp +++ b/contrib/llvm/lib/CodeGen/MachineDominators.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -31,7 +31,7 @@ static cl::opt<bool, true> VerifyMachineDomInfoX( namespace llvm { template class DomTreeNodeBase<MachineBasicBlock>; -template class DominatorTreeBase<MachineBasicBlock>; +template class DominatorTreeBase<MachineBasicBlock, false>; // DomTreeBase } char MachineDominatorTree::ID = 0; @@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { CriticalEdgesToSplit.clear(); NewBBs.clear(); + DT.reset(new DomTreeBase<MachineBasicBlock>()); DT->recalculate(F); - return false; } MachineDominatorTree::MachineDominatorTree() : MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); - DT = new DominatorTreeBase<MachineBasicBlock>(false); -} - -MachineDominatorTree::~MachineDominatorTree() { - delete DT; } void MachineDominatorTree::releaseMemory() { - DT->releaseMemory(); + CriticalEdgesToSplit.clear(); + DT.reset(nullptr); } void MachineDominatorTree::verifyAnalysis() const { - if (VerifyMachineDomInfo) + if (DT && VerifyMachineDomInfo) verifyDomTree(); } void MachineDominatorTree::print(raw_ostream &OS, const Module*) const { - DT->print(OS); + if (DT) + DT->print(OS); } void MachineDominatorTree::applySplitCriticalEdges() const { @@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const { } void MachineDominatorTree::verifyDomTree() const { + if (!DT) + return; MachineFunction &F = *getRoot()->getParent(); - MachineDominatorTree OtherDT; - OtherDT.DT->recalculate(F); - if (compare(OtherDT)) { + DomTreeBase<MachineBasicBlock> OtherDT; + OtherDT.recalculate(F); + if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || + DT->compare(OtherDT)) { errs() << "MachineDominatorTree is not up to date!\nComputed:\n"; - print(errs(), nullptr); + DT->print(errs()); errs() << "\nActual:\n"; - OtherDT.print(errs(), nullptr); + OtherDT.print(errs()); abort(); } } diff --git a/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp new file mode 100644 index 0000000..73d778f --- /dev/null +++ b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp @@ -0,0 +1,244 @@ +//===-- 
MachineFrameInfo.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file Implements MachineFrameInfo that manages the stack frame. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFrameInfo.h" + +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> + +#define DEBUG_TYPE "codegen" + +using namespace llvm; + +void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { + if (!StackRealignable) + assert(Align <= StackAlignment && + "For targets without stack realignment, Align is out of limit!"); + if (MaxAlignment < Align) MaxAlignment = Align; +} + +/// Clamp the alignment if requested and emit a warning. +static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, + unsigned StackAlign) { + if (!ShouldClamp || Align <= StackAlign) + return Align; + DEBUG(dbgs() << "Warning: requested alignment " << Align + << " exceeds the stack alignment " << StackAlign + << " when stack realignment is off" << '\n'); + return StackAlign; +} + +int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, + bool isSS, const AllocaInst *Alloca) { + assert(Size != 0 && "Cannot allocate zero size stack objects!"); + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, + !isSS)); + int Index = (int)Objects.size() - NumFixedObjects - 1; + assert(Index >= 0 && "Bad frame index!"); + ensureMaxAlignment(Alignment); + return Index; +} + +int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, + unsigned Alignment) { + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + CreateStackObject(Size, Alignment, true); + int Index = (int)Objects.size() - NumFixedObjects - 1; + ensureMaxAlignment(Alignment); + return Index; +} + +int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, + const AllocaInst *Alloca) { + HasVarSizedObjects = true; + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true)); + ensureMaxAlignment(Alignment); + return (int)Objects.size()-NumFixedObjects-1; +} + +int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, + bool Immutable, bool isAliased) { + assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); + // The alignment of the frame index can be determined from its offset from + // the incoming frame position. If the frame object is at offset 32 and + // the stack is guaranteed to be 16-byte aligned, then we know that the + // object is 16-byte aligned. Note that unlike the non-fixed case, if the + // stack needs realignment, we can't assume that the stack will in fact be + // aligned. + unsigned Align = MinAlign(SPOffset, ForcedRealign ? 
1 : StackAlignment); + Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, + /*isSS*/ false, + /*Alloca*/ nullptr, isAliased)); + return -++NumFixedObjects; +} + +int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, + int64_t SPOffset, + bool Immutable) { + unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); + Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, + /*isSS*/ true, + /*Alloca*/ nullptr, + /*isAliased*/ false)); + return -++NumFixedObjects; +} + +BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + BitVector BV(TRI->getNumRegs()); + + // Before CSI is calculated, no registers are considered pristine. They can be + // freely used and PEI will make sure they are saved. + if (!isCalleeSavedInfoValid()) + return BV; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; + ++CSR) + BV.set(*CSR); + + // Saved CSRs are not pristine. + for (auto &I : getCalleeSavedInfo()) + for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) + BV.reset(*S); + + return BV; +} + +unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + unsigned MaxAlign = getMaxAlignment(); + int Offset = 0; + + // This code is very, very similar to PEI::calculateFrameObjectOffsets(). + // It really should be refactored to share code. Until then, changes + // should keep in mind that there's tight coupling between the two. + + for (int i = getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) { + if (isDeadObjectIndex(i)) + continue; + Offset += getObjectSize(i); + unsigned Align = getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + + MaxAlign = std::max(Align, MaxAlign); + } + + if (adjustsStack() && TFI->hasReservedCallFrame(MF)) + Offset += getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (adjustsStack() || hasVarSizedObjects() || + (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0)) + StackAlign = TFI->getStackAlignment(); + else + StackAlign = TFI->getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. 
+ StackAlign = std::max(StackAlign, MaxAlign); + unsigned AlignMask = StackAlign - 1; + Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); + + return (unsigned)Offset; +} + +void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + assert(FrameSetupOpcode != ~0u && FrameDestroyOpcode != ~0u && + "Can only compute MaxCallFrameSize if Setup/Destroy opcode are known"); + + MaxCallFrameSize = 0; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) { + unsigned Size = TII.getFrameSize(MI); + MaxCallFrameSize = std::max(MaxCallFrameSize, Size); + AdjustsStack = true; + } else if (MI.isInlineAsm()) { + // Some inline asm's need a stack frame, as indicated by operand 1. + unsigned ExtraInfo = MI.getOperand(InlineAsm::MIOp_ExtraInfo).getImm(); + if (ExtraInfo & InlineAsm::Extra_IsAlignStack) + AdjustsStack = true; + } + } + } +} + +void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ + if (Objects.empty()) return; + + const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering(); + int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0); + + OS << "Frame Objects:\n"; + + for (unsigned i = 0, e = Objects.size(); i != e; ++i) { + const StackObject &SO = Objects[i]; + OS << " fi#" << (int)(i-NumFixedObjects) << ": "; + if (SO.Size == ~0ULL) { + OS << "dead\n"; + continue; + } + if (SO.Size == 0) + OS << "variable sized"; + else + OS << "size=" << SO.Size; + OS << ", align=" << SO.Alignment; + + if (i < NumFixedObjects) + OS << ", fixed"; + if (i < NumFixedObjects || SO.SPOffset != -1) { + int64_t Off = SO.SPOffset - ValOffset; + OS << ", at location [SP"; + if (Off > 0) + OS << "+" << Off; + else if (Off < 0) + OS << Off; + OS << "]"; + } + OS << "\n"; + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const { + print(MF, dbgs()); +} +#endif diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp index c1d5ea9..742b095 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunction.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp @@ -20,7 +20,6 @@ #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunctionInitializer.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -52,8 +51,6 @@ static cl::opt<unsigned> cl::desc("Force the alignment of all functions."), cl::init(0), cl::Hidden); -void MachineFunctionInitializer::anchor() {} - static const char *getPropertyName(MachineFunctionProperties::Property Prop) { typedef MachineFunctionProperties::Property P; switch(Prop) { @@ -169,6 +166,7 @@ void MachineFunction::clear() { InstructionRecycler.clear(Allocator); OperandRecycler.clear(Allocator); BasicBlockRecycler.clear(Allocator); + VariableDbgInfos.clear(); if (RegInfo) { RegInfo->~MachineRegisterInfo(); Allocator.Deallocate(RegInfo); @@ -307,11 +305,11 @@ MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) { MachineMemOperand *MachineFunction::getMachineMemOperand( MachinePointerInfo PtrInfo, 
MachineMemOperand::Flags f, uint64_t s, unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges, - SynchronizationScope SynchScope, AtomicOrdering Ordering, + SyncScope::ID SSID, AtomicOrdering Ordering, AtomicOrdering FailureOrdering) { return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges, - SynchScope, Ordering, FailureOrdering); + SSID, Ordering, FailureOrdering); } MachineMemOperand * @@ -322,13 +320,27 @@ MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO, MachineMemOperand(MachinePointerInfo(MMO->getValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, MMO->getBaseAlignment(), - AAMDNodes(), nullptr, MMO->getSynchScope(), + AAMDNodes(), nullptr, MMO->getSyncScopeID(), MMO->getOrdering(), MMO->getFailureOrdering()); return new (Allocator) MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, MMO->getBaseAlignment(), - AAMDNodes(), nullptr, MMO->getSynchScope(), + AAMDNodes(), nullptr, MMO->getSyncScopeID(), + MMO->getOrdering(), MMO->getFailureOrdering()); +} + +MachineMemOperand * +MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO, + const AAMDNodes &AAInfo) { + MachinePointerInfo MPI = MMO->getValue() ? + MachinePointerInfo(MMO->getValue(), MMO->getOffset()) : + MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset()); + + return new (Allocator) + MachineMemOperand(MPI, MMO->getFlags(), MMO->getSize(), + MMO->getBaseAlignment(), AAInfo, + MMO->getRanges(), MMO->getSyncScopeID(), MMO->getOrdering(), MMO->getFailureOrdering()); } @@ -361,7 +373,7 @@ MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin, (*I)->getFlags() & ~MachineMemOperand::MOStore, (*I)->getSize(), (*I)->getBaseAlignment(), (*I)->getAAInfo(), nullptr, - (*I)->getSynchScope(), (*I)->getOrdering(), + (*I)->getSyncScopeID(), (*I)->getOrdering(), (*I)->getFailureOrdering()); Result[Index] = JustLoad; } @@ -395,7 +407,7 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin, (*I)->getFlags() & ~MachineMemOperand::MOLoad, (*I)->getSize(), (*I)->getBaseAlignment(), (*I)->getAAInfo(), nullptr, - (*I)->getSynchScope(), (*I)->getOrdering(), + (*I)->getSyncScopeID(), (*I)->getOrdering(), (*I)->getFailureOrdering()); Result[Index] = JustStore; } @@ -756,212 +768,6 @@ void llvm::addLandingPadInfo(const LandingPadInst &I, MachineBasicBlock &MBB) { /// \} //===----------------------------------------------------------------------===// -// MachineFrameInfo implementation -//===----------------------------------------------------------------------===// - -/// Make sure the function is at least Align bytes aligned. -void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { - if (!StackRealignable) - assert(Align <= StackAlignment && - "For targets without stack realignment, Align is out of limit!"); - if (MaxAlignment < Align) MaxAlignment = Align; -} - -/// Clamp the alignment if requested and emit a warning. -static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, - unsigned StackAlign) { - if (!ShouldClamp || Align <= StackAlign) - return Align; - DEBUG(dbgs() << "Warning: requested alignment " << Align - << " exceeds the stack alignment " << StackAlign - << " when stack realignment is off" << '\n'); - return StackAlign; -} - -/// Create a new statically sized stack object, returning a nonnegative -/// identifier to represent it. 
-int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, - bool isSS, const AllocaInst *Alloca) { - assert(Size != 0 && "Cannot allocate zero size stack objects!"); - Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, - !isSS)); - int Index = (int)Objects.size() - NumFixedObjects - 1; - assert(Index >= 0 && "Bad frame index!"); - ensureMaxAlignment(Alignment); - return Index; -} - -/// Create a new statically sized stack object that represents a spill slot, -/// returning a nonnegative identifier to represent it. -int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, - unsigned Alignment) { - Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - CreateStackObject(Size, Alignment, true); - int Index = (int)Objects.size() - NumFixedObjects - 1; - ensureMaxAlignment(Alignment); - return Index; -} - -/// Notify the MachineFrameInfo object that a variable sized object has been -/// created. This must be created whenever a variable sized object is created, -/// whether or not the index returned is actually used. -int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, - const AllocaInst *Alloca) { - HasVarSizedObjects = true; - Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true)); - ensureMaxAlignment(Alignment); - return (int)Objects.size()-NumFixedObjects-1; -} - -/// Create a new object at a fixed location on the stack. -/// All fixed objects should be created before other objects are created for -/// efficiency. By default, fixed objects are immutable. This returns an -/// index with a negative value. -int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, - bool Immutable, bool isAliased) { - assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); - // The alignment of the frame index can be determined from its offset from - // the incoming frame position. If the frame object is at offset 32 and - // the stack is guaranteed to be 16-byte aligned, then we know that the - // object is 16-byte aligned. Note that unlike the non-fixed case, if the - // stack needs realignment, we can't assume that the stack will in fact be - // aligned. - unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); - Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); - Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/ false, - /*Alloca*/ nullptr, isAliased)); - return -++NumFixedObjects; -} - -/// Create a spill slot at a fixed location on the stack. -/// Returns an index with a negative value. -int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, - int64_t SPOffset, - bool Immutable) { - unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); - Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); - Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/ true, - /*Alloca*/ nullptr, - /*isAliased*/ false)); - return -++NumFixedObjects; -} - -BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - BitVector BV(TRI->getNumRegs()); - - // Before CSI is calculated, no registers are considered pristine. They can be - // freely used and PEI will make sure they are saved. 
- if (!isCalleeSavedInfoValid()) - return BV; - - for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) - BV.set(*CSR); - - // Saved CSRs are not pristine. - for (auto &I : getCalleeSavedInfo()) - for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) - BV.reset(*S); - - return BV; -} - -unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - unsigned MaxAlign = getMaxAlignment(); - int Offset = 0; - - // This code is very, very similar to PEI::calculateFrameObjectOffsets(). - // It really should be refactored to share code. Until then, changes - // should keep in mind that there's tight coupling between the two. - - for (int i = getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -getObjectOffset(i); - if (FixedOff > Offset) Offset = FixedOff; - } - for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) { - if (isDeadObjectIndex(i)) - continue; - Offset += getObjectSize(i); - unsigned Align = getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset+Align-1)/Align*Align; - - MaxAlign = std::max(Align, MaxAlign); - } - - if (adjustsStack() && TFI->hasReservedCallFrame(MF)) - Offset += getMaxCallFrameSize(); - - // Round up the size to a multiple of the alignment. If the function has - // any calls or alloca's, align to the target's StackAlignment value to - // ensure that the callee's frame or the alloca data is suitably aligned; - // otherwise, for leaf functions, align to the TransientStackAlignment - // value. - unsigned StackAlign; - if (adjustsStack() || hasVarSizedObjects() || - (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0)) - StackAlign = TFI->getStackAlignment(); - else - StackAlign = TFI->getTransientStackAlignment(); - - // If the frame pointer is eliminated, all frame offsets will be relative to - // SP not FP. Align to MaxAlign so this works. - StackAlign = std::max(StackAlign, MaxAlign); - unsigned AlignMask = StackAlign - 1; - Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); - - return (unsigned)Offset; -} - -void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ - if (Objects.empty()) return; - - const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering(); - int ValOffset = (FI ? 
FI->getOffsetOfLocalArea() : 0); - - OS << "Frame Objects:\n"; - - for (unsigned i = 0, e = Objects.size(); i != e; ++i) { - const StackObject &SO = Objects[i]; - OS << " fi#" << (int)(i-NumFixedObjects) << ": "; - if (SO.Size == ~0ULL) { - OS << "dead\n"; - continue; - } - if (SO.Size == 0) - OS << "variable sized"; - else - OS << "size=" << SO.Size; - OS << ", align=" << SO.Alignment; - - if (i < NumFixedObjects) - OS << ", fixed"; - if (i < NumFixedObjects || SO.SPOffset != -1) { - int64_t Off = SO.SPOffset - ValOffset; - OS << ", at location [SP"; - if (Off > 0) - OS << "+" << Off; - else if (Off < 0) - OS << Off; - OS << "]"; - } - OS << "\n"; - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void MachineFrameInfo::dump(const MachineFunction &MF) const { - print(MF, dbgs()); -} -#endif - -//===----------------------------------------------------------------------===// // MachineJumpTableInfo implementation //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp index 2265676..5ffe330 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp @@ -42,7 +42,7 @@ bool MachineFunctionPass::runOnFunction(Function &F) { return false; MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); - MachineFunction &MF = MMI.getMachineFunction(F); + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); MachineFunctionProperties &MFProps = MF.getProperties(); diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp index 0d533c3..55d9def 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp index 2f2e3b3..535757e 100644 --- a/contrib/llvm/lib/CodeGen/MachineInstr.cpp +++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp @@ -1,4 +1,4 @@ -//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===// +//===- lib/CodeGen/MachineInstr.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,20 +12,34 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" #include 
"llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -34,10 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -45,6 +63,14 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <utility> + using namespace llvm; static cl::opt<bool> PrintWholeRegMask( @@ -256,14 +282,27 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_GlobalAddress: return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset(); case MachineOperand::MO_ExternalSymbol: - return !strcmp(getSymbolName(), Other.getSymbolName()) && + return strcmp(getSymbolName(), Other.getSymbolName()) == 0 && getOffset() == Other.getOffset(); case MachineOperand::MO_BlockAddress: return getBlockAddress() == Other.getBlockAddress() && getOffset() == Other.getOffset(); case MachineOperand::MO_RegisterMask: - case MachineOperand::MO_RegisterLiveOut: - return getRegMask() == Other.getRegMask(); + case MachineOperand::MO_RegisterLiveOut: { + // Shallow compare of the two RegMasks + const uint32_t *RegMask = getRegMask(); + const uint32_t *OtherRegMask = Other.getRegMask(); + if (RegMask == OtherRegMask) + return true; + + // Calculate the size of the RegMask + const MachineFunction *MF = getParent()->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + + // Deep compare of the two RegMasks + return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask); + } case MachineOperand::MO_MCSymbol: return getMCSymbol() == Other.getMCSymbol(); case MachineOperand::MO_CFIIndex: @@ -403,6 +442,19 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, bool Unused; APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused); OS << "half " << APF.convertToFloat(); + } else if (getFPImm()->getType()->isFP128Ty()) { + APFloat APF = getFPImm()->getValueAPF(); + SmallString<16> Str; + getFPImm()->getValueAPF().toString(Str); + OS << "quad " << Str; + } else if (getFPImm()->getType()->isX86_FP80Ty()) { + APFloat APF = getFPImm()->getValueAPF(); + OS << "x86_fp80 0xK"; + APInt API = APF.bitcastToAPInt(); + OS << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4, + /*Upper=*/true); + OS << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); } else { OS << getFPImm()->getValueAPF().convertToDouble(); } @@ -491,6 +543,7 @@ 
void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, auto Pred = static_cast<CmpInst::Predicate>(getPredicate()); OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred") << CmpInst::getPredicateName(Pred) << '>'; + break; } } if (unsigned TF = getTargetFlags()) @@ -514,6 +567,21 @@ unsigned MachinePointerInfo::getAddrSpace() const { return cast<PointerType>(V.get<const Value*>()->getType())->getAddressSpace(); } +/// isDereferenceable - Return true if V is always dereferenceable for +/// Offset + Size byte. +bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C, + const DataLayout &DL) const { + if (!V.is<const Value*>()) + return false; + + const Value *BasePtr = V.get<const Value*>(); + if (BasePtr == nullptr) + return false; + + return isDereferenceableAndAlignedPointer( + BasePtr, 1, APInt(DL.getPointerSizeInBits(), Offset + Size), DL); +} + /// getConstantPool - Return a MachinePointerInfo record that refers to the /// constant pool. MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) { @@ -544,7 +612,7 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f, uint64_t s, unsigned int a, const AAMDNodes &AAInfo, const MDNode *Ranges, - SynchronizationScope SynchScope, + SyncScope::ID SSID, AtomicOrdering Ordering, AtomicOrdering FailureOrdering) : PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlignLog2(Log2_32(a) + 1), @@ -555,8 +623,8 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f, assert(getBaseAlignment() == a && "Alignment is not a power of 2!"); assert((isLoad() || isStore()) && "Not a load/store!"); - AtomicInfo.SynchScope = static_cast<unsigned>(SynchScope); - assert(getSynchScope() == SynchScope && "Value truncated"); + AtomicInfo.SSID = static_cast<unsigned>(SSID); + assert(getSyncScopeID() == SSID && "Value truncated"); AtomicInfo.Ordering = static_cast<unsigned>(Ordering); assert(getOrdering() == Ordering && "Value truncated"); AtomicInfo.FailureOrdering = static_cast<unsigned>(FailureOrdering); @@ -682,6 +750,12 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { OS << "(dereferenceable)"; if (isInvariant()) OS << "(invariant)"; + if (getFlags() & MOTargetFlag1) + OS << "(flag1)"; + if (getFlags() & MOTargetFlag2) + OS << "(flag2)"; + if (getFlags() & MOTargetFlag3) + OS << "(flag3)"; } //===----------------------------------------------------------------------===// @@ -704,9 +778,7 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// the MCInstrDesc. MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, DebugLoc dl, bool NoImp) - : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0), - AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr), - debugLoc(std::move(dl)) { + : MCID(&tid), debugLoc(std::move(dl)) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. 
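The register-mask equality check added to MachineOperand::isIdenticalTo a few hunks above (pointer compare first, then a word-by-word compare over ceil(NumRegs/32) 32-bit words) can be sketched without any MachineOperand machinery; regMasksEqual and the 64-register masks below are illustrative assumptions, not LLVM code:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Each register gets one bit; masks are stored as arrays of 32-bit words.
static bool regMasksEqual(const uint32_t *A, const uint32_t *B,
                          unsigned NumRegs) {
  if (A == B)                            // shallow compare: same array
    return true;
  unsigned Words = (NumRegs + 31) / 32;  // round up to whole words
  return std::equal(A, A + Words, B);    // deep compare
}

int main() {
  uint32_t M1[2] = {0xffffffffu, 0x1u};
  uint32_t M2[2] = {0xffffffffu, 0x1u};
  assert(regMasksEqual(M1, M1, 64));  // identical pointers
  assert(regMasksEqual(M1, M2, 64));  // distinct arrays, same contents
  return 0;
}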
@@ -723,9 +795,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, /// MachineInstr ctor - Copies MachineInstr arg exactly /// MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) - : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0), - Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs), - MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc()) { + : MCID(&MI.getDesc()), NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs), + debugLoc(MI.getDebugLoc()) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); CapOperands = OperandCapacity::get(MI.getNumOperands()); @@ -1571,6 +1642,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const { return true; } +bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other, + bool UseTBAA) { + const MachineFunction *MF = getParent()->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + + // If neither instruction stores to memory, they can't alias in any + // meaningful way, even if they read from the same address. + if (!mayStore() && !Other.mayStore()) + return false; + + // Let the target decide if memory accesses cannot possibly overlap. + if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA)) + return false; + + if (!AA) + return true; + + // FIXME: Need to handle multiple memory operands to support all targets. + if (!hasOneMemOperand() || !Other.hasOneMemOperand()) + return true; + + MachineMemOperand *MMOa = *memoperands_begin(); + MachineMemOperand *MMOb = *Other.memoperands_begin(); + + if (!MMOa->getValue() || !MMOb->getValue()) + return true; + + // The following interface to AA is fashioned after DAGCombiner::isAlias + // and operates with MachineMemOperand offset with some important + // assumptions: + // - LLVM fundamentally assumes flat address spaces. + // - MachineOperand offset can *only* result from legalization and + // cannot affect queries other than the trivial case of overlap + // checking. + // - These offsets never wrap and never step outside + // of allocated objects. + // - There should never be any negative offsets here. + // + // FIXME: Modify API to hide this math from "user" + // FIXME: Even before we go to AA we can reason locally about some + // memory objects. It can save compile time, and possibly catch some + // corner cases not currently covered. + + assert((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); + + int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); + int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; + int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; + + AliasResult AAResult = + AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, + UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), + MemoryLocation(MMOb->getValue(), Overlapb, + UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); + + return (AAResult != NoAlias); +} + /// hasOrderedMemoryRef - Return true if this instruction may have an ordered /// or volatile memory reference, or if the information describing the memory /// reference is not available. Return false if it is known to have no ordered @@ -1589,7 +1719,7 @@ bool MachineInstr::hasOrderedMemoryRef() const { return true; // Check if any of our memory operands are ordered. 
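A worked example of the offset arithmetic in the new MachineInstr::mayAlias above, using made-up offsets and sizes; this only mirrors the MinOffset/Overlap computation handed to the alias query, not the AliasAnalysis call itself:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Two accesses off the same base value: a 4-byte access at offset 16 and
  // an 8-byte access at offset 8.
  int64_t OffA = 16, SizeA = 4;
  int64_t OffB = 8,  SizeB = 8;

  // mayAlias rebases both accesses at the smaller offset and hands AA the
  // distance from that point to each access's end.
  int64_t MinOffset = std::min(OffA, OffB);
  int64_t OverlapA = SizeA + OffA - MinOffset;  // 4 + 16 - 8 = 12
  int64_t OverlapB = SizeB + OffB - MinOffset;  // 8 +  8 - 8 =  8
  assert(OverlapA == 12 && OverlapB == 8);

  // The accesses cover [16,20) and [8,16): disjoint, which is what a precise
  // alias query should conclude here.
  bool Disjoint = (OffA + SizeA <= OffB) || (OffB + SizeB <= OffA);
  assert(Disjoint);
  return 0;
}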
- return any_of(memoperands(), [](const MachineMemOperand *MMO) { + return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) { return !MMO->isUnordered(); }); } @@ -1692,14 +1822,14 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF, } } -LLVM_DUMP_METHOD void MachineInstr::dump(const TargetInstrInfo *TII) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineInstr::dump() const { dbgs() << " "; - print(dbgs(), false /* SkipOpers */, TII); -#endif + print(dbgs()); } +#endif -void MachineInstr::print(raw_ostream &OS, bool SkipOpers, +void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc, const TargetInstrInfo *TII) const { const Module *M = nullptr; if (const MachineBasicBlock *MBB = getParent()) @@ -1707,11 +1837,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers, M = MF->getFunction()->getParent(); ModuleSlotTracker MST(M); - print(OS, MST, SkipOpers, TII); + print(OS, MST, SkipOpers, SkipDebugLoc, TII); } void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, - bool SkipOpers, const TargetInstrInfo *TII) const { + bool SkipOpers, bool SkipDebugLoc, + const TargetInstrInfo *TII) const { // We can be a bit tidier if we know the MachineFunction. const MachineFunction *MF = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -1762,7 +1893,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, return; // Print the rest of the operands. - bool OmittedAnyCallClobbers = false; bool FirstOp = true; unsigned AsmDescOp = ~0u; unsigned AsmOpCount = 0; @@ -1799,31 +1929,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) VirtRegs.push_back(MO.getReg()); - // Omit call-clobbered registers which aren't used anywhere. This makes - // call instructions much less noisy on targets where calls clobber lots - // of registers. Don't rely on MO.isDead() because we may be called before - // LiveVariables is run, or we may be looking at a non-allocatable reg. - if (MRI && isCall() && - MO.isReg() && MO.isImplicit() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (MRI->use_empty(Reg)) { - bool HasAliasLive = false; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - unsigned AliasReg = *AI; - if (!MRI->use_empty(AliasReg)) { - HasAliasLive = true; - break; - } - } - if (!HasAliasLive) { - OmittedAnyCallClobbers = true; - continue; - } - } - } - } - if (FirstOp) FirstOp = false; else OS << ","; OS << " "; if (i < getDesc().NumOperands) { @@ -1905,12 +2010,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, MO.print(OS, MST, TRI); } - // Briefly indicate whether any call clobbers were omitted. - if (OmittedAnyCallClobbers) { - if (!FirstOp) OS << ","; - OS << " ..."; - } - bool HaveSemi = false; const unsigned PrintableFlags = FrameSetup | FrameDestroy; if (Flags & PrintableFlags) { @@ -1987,6 +2086,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, } if (isIndirectDebugValue()) OS << " indirect"; + } else if (SkipDebugLoc) { + return; } else if (debugLoc && MF) { if (!HaveSemi) OS << ";"; @@ -2174,8 +2275,8 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs, unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // If there are no uses, including partial uses, the def is dead. 
- if (none_of(UsedRegs, - [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) + if (llvm::none_of(UsedRegs, + [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) MO.setIsDead(); } @@ -2263,3 +2364,26 @@ MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB, BB.insert(I, MI); return MachineInstrBuilder(MF, MI); } + +MachineInstr *llvm::buildDbgValueForSpill(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + const MachineInstr &Orig, + int FrameIndex) { + const MDNode *Var = Orig.getDebugVariable(); + const auto *Expr = cast_or_null<DIExpression>(Orig.getDebugExpression()); + bool IsIndirect = Orig.isIndirectDebugValue(); + uint64_t Offset = IsIndirect ? Orig.getOperand(1).getImm() : 0; + DebugLoc DL = Orig.getDebugLoc(); + assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + // If the DBG_VALUE already was a memory location, add an extra + // DW_OP_deref. Otherwise just turning this from a register into a + // memory/indirect location is sufficient. + if (IsIndirect) + Expr = DIExpression::prepend(Expr, DIExpression::WithDeref); + return BuildMI(BB, I, DL, Orig.getDesc()) + .addFrameIndex(FrameIndex) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); +} diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp index b3d1843..c7113f1 100644 --- a/contrib/llvm/lib/CodeGen/MachineLICM.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" @@ -26,6 +25,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/CommandLine.h" @@ -38,7 +38,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "machine-licm" +#define DEBUG_TYPE "machinelicm" static cl::opt<bool> AvoidSpeculation("avoid-speculation", @@ -237,13 +237,13 @@ namespace { char MachineLICM::ID = 0; char &llvm::MachineLICMID = MachineLICM::ID; -INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) /// Test if the given loop is the outer-most loop that has a unique predecessor. static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { @@ -330,7 +330,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { /// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { // If we lost memory operands, conservatively assume that the instruction - // writes to all slots. + // writes to all slots. 
if (MI->memoperands_empty()) return true; for (const MachineMemOperand *MemOp : MI->memoperands()) { @@ -708,7 +708,7 @@ void MachineLICM::SinkIntoLoop() { for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin(); I != Preheader->instr_end(); ++I) { // We need to ensure that we can safely move this instruction into the loop. - // As such, it must not have side-effects, e.g. such as a call has. + // As such, it must not have side-effects, e.g. such as a call has. if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I)) Candidates.push_back(&*I); } @@ -837,9 +837,9 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, /// constant pool. static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); - + // If we lost memory operands, conservatively assume that the instruction - // reads from everything.. + // reads from everything.. if (MI.memoperands_empty()) return true; @@ -895,8 +895,11 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. Alternatively, if it's allocatable, // it could get allocated to something with a def during allocation. - if (!MRI->isConstantPhysReg(Reg)) - return false; + // However, if the physreg is known to always be caller saved/restored + // then this use is safe to hoist. + if (!MRI->isConstantPhysReg(Reg) && + !(TRI->isCallerPreservedPhysReg(Reg, *I.getParent()->getParent()))) + return false; // Otherwise it's safe to move. continue; } else if (!MO.isDead()) { @@ -1337,7 +1340,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); // Since we are moving the instruction out of its basic block, we do not - // retain its debug location. Doing so would degrade the debugging + // retain its debug location. Doing so would degrade the debugging // experience and adversely affect the accuracy of profiling information. MI->setDebugLoc(DebugLoc()); diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp index fdeaf7b..a9aa1d9 100644 --- a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -87,6 +87,22 @@ MachineBasicBlock *MachineLoop::findLoopControlBlock() { return nullptr; } +DebugLoc MachineLoop::getStartLoc() const { + // Try the pre-header first. + if (MachineBasicBlock *PHeadMBB = getLoopPreheader()) + if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock()) + if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc()) + return DL; + + // If we have no pre-header or there are no instructions with debug + // info in it, try the header. 
+ if (MachineBasicBlock *HeadMBB = getHeader()) + if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock()) + return HeadBB->getTerminator()->getDebugLoc(); + + return DebugLoc(); +} + MachineBasicBlock * MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader) const { diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp index 6618857..825290a 100644 --- a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -8,43 +8,51 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionInitializer.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Constants.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Dwarf.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <memory> +#include <utility> +#include <vector> + using namespace llvm; using namespace llvm::dwarf; // Handle the Pass registration stuff necessary to use DataLayout's. -INITIALIZE_TM_PASS(MachineModuleInfo, "machinemoduleinfo", - "Machine Module Information", false, false) +INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo", + "Machine Module Information", false, false) char MachineModuleInfo::ID = 0; // Out of line virtual method. -MachineModuleInfoImpl::~MachineModuleInfoImpl() {} +MachineModuleInfoImpl::~MachineModuleInfoImpl() = default; namespace llvm { + class MMIAddrLabelMapCallbackPtr final : CallbackVH { - MMIAddrLabelMap *Map; + MMIAddrLabelMap *Map = nullptr; + public: - MMIAddrLabelMapCallbackPtr() : Map(nullptr) {} - MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(nullptr) {} + MMIAddrLabelMapCallbackPtr() = default; + MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {} void setPtr(BasicBlock *BB) { ValueHandleBase::operator=(BB); @@ -75,11 +83,12 @@ class MMIAddrLabelMap { /// This is a per-function list of symbols whose corresponding BasicBlock got /// deleted. These symbols need to be emitted at some point in the file, so /// AsmPrinter emits them after the function body. 
- DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> > + DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>> DeletedAddrLabelsNeedingEmission; -public: +public: MMIAddrLabelMap(MCContext &context) : Context(context) {} + ~MMIAddrLabelMap() { assert(DeletedAddrLabelsNeedingEmission.empty() && "Some labels for deleted blocks never got emitted"); @@ -93,7 +102,8 @@ public: void UpdateForDeletedBlock(BasicBlock *BB); void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New); }; -} + +} // end namespace llvm ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { assert(BB->hasAddressTaken() && @@ -119,7 +129,7 @@ ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { /// If we have any deleted symbols for F, return them. void MMIAddrLabelMap:: takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) { - DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >::iterator I = + DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>::iterator I = DeletedAddrLabelsNeedingEmission.find(F); // If there are no entries for the function, just return. @@ -130,7 +140,6 @@ takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) { DeletedAddrLabelsNeedingEmission.erase(I); } - void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) { // If the block got deleted, there is no need for the symbol. If the symbol // was already emitted, we can just forget about it, otherwise we need to @@ -177,7 +186,6 @@ void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { OldEntry.Symbols.end()); } - void MMIAddrLabelMapCallbackPtr::deleted() { Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr())); } @@ -186,9 +194,6 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2)); } - -//===----------------------------------------------------------------------===// - MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM) : ImmutablePass(ID), TM(*TM), Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(), @@ -196,11 +201,9 @@ MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM) initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry()); } -MachineModuleInfo::~MachineModuleInfo() { -} +MachineModuleInfo::~MachineModuleInfo() = default; bool MachineModuleInfo::doInitialization(Module &M) { - ObjFileMMI = nullptr; CurCallSite = 0; DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false; @@ -211,7 +214,6 @@ bool MachineModuleInfo::doInitialization(Module &M) { } bool MachineModuleInfo::doFinalization(Module &M) { - Personalities.clear(); delete AddrLabelSymbols; @@ -256,7 +258,14 @@ void MachineModuleInfo::addPersonality(const Function *Personality) { /// \} -MachineFunction &MachineModuleInfo::getMachineFunction(const Function &F) { +MachineFunction * +MachineModuleInfo::getMachineFunction(const Function &F) const { + auto I = MachineFunctions.find(&F); + return I != MachineFunctions.end() ? I->second.get() : nullptr; +} + +MachineFunction & +MachineModuleInfo::getOrCreateMachineFunction(const Function &F) { // Shortcut for the common case where a sequence of MachineFunctionPasses // all query for the same Function. if (LastRequest == &F) @@ -270,10 +279,6 @@ MachineFunction &MachineModuleInfo::getMachineFunction(const Function &F) { MF = new MachineFunction(&F, TM, NextFnNum++, *this); // Update the set entry. 
I.first->second.reset(MF); - - if (MFInitializer) - if (MFInitializer->initializeMachineFunction(*MF)) - report_fatal_error("Unable to initialize machine function"); } else { MF = I.first->second.get(); } @@ -290,10 +295,12 @@ void MachineModuleInfo::deleteMachineFunctionFor(Function &F) { } namespace { + /// This pass frees the MachineFunction object associated with a Function. class FreeMachineFunction : public FunctionPass { public: static char ID; + FreeMachineFunction() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -306,15 +313,19 @@ public: MMI.deleteMachineFunctionFor(F); return true; } + + StringRef getPassName() const override { + return "Free MachineFunction"; + } }; -char FreeMachineFunction::ID; + } // end anonymous namespace -namespace llvm { -FunctionPass *createFreeMachineFunctionPass() { +char FreeMachineFunction::ID; + +FunctionPass *llvm::createFreeMachineFunctionPass() { return new FreeMachineFunction(); } -} // end namespace llvm //===- MMI building helpers -----------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp new file mode 100644 index 0000000..73c3428 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -0,0 +1,108 @@ +///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===// +/// +/// The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// Optimization diagnostic interfaces for machine passes. It's packaged as an +/// analysis pass so that by using this service passes become dependent on MBFI +/// as well. MBFI is used to compute the "hotness" of the diagnostic message. +/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +DiagnosticInfoMIROptimization::MachineArgument::MachineArgument( + StringRef MKey, const MachineInstr &MI) + : Argument() { + Key = MKey; + + raw_string_ostream OS(Val); + MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true); +} + +Optional<uint64_t> +MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) { + if (!MBFI) + return None; + + return MBFI->getBlockProfileCount(&MBB); +} + +void MachineOptimizationRemarkEmitter::computeHotness( + DiagnosticInfoMIROptimization &Remark) { + const MachineBasicBlock *MBB = Remark.getBlock(); + if (MBB) + Remark.setHotness(computeHotness(*MBB)); +} + +void MachineOptimizationRemarkEmitter::emit( + DiagnosticInfoOptimizationBase &OptDiagCommon) { + auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon); + computeHotness(OptDiag); + + LLVMContext &Ctx = MF.getFunction()->getContext(); + + // If a diagnostic has a hotness value, then only emit it if its hotness + // meets the threshold. 
+ if (OptDiag.getHotness() && + *OptDiag.getHotness() < Ctx.getDiagnosticsHotnessThreshold()) { + return; + } + + yaml::Output *Out = Ctx.getDiagnosticsOutputFile(); + if (Out) { + auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon); + *Out << P; + } + // FIXME: now that IsVerbose is part of DI, filtering for this will be moved + // from here to clang. + if (!OptDiag.isVerbose() || shouldEmitVerbose()) + Ctx.diagnose(OptDiag); +} + +MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass() + : MachineFunctionPass(ID) { + initializeMachineOptimizationRemarkEmitterPassPass( + *PassRegistry::getPassRegistry()); +} + +bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction( + MachineFunction &MF) { + MachineBlockFrequencyInfo *MBFI; + + if (MF.getFunction()->getContext().getDiagnosticsHotnessRequested()) + MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI(); + else + MBFI = nullptr; + + ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); + return false; +} + +void MachineOptimizationRemarkEmitterPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +char MachineOptimizationRemarkEmitterPass::ID = 0; +static const char ore_name[] = "Machine Optimization Remark Emitter"; +#define ORE_NAME "machine-opt-remark-emitter" + +INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name, + false, true) +INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass) +INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name, + false, true) diff --git a/contrib/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp new file mode 100644 index 0000000..fd6b242 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp @@ -0,0 +1,1251 @@ +//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Replaces repeated sequences of instructions with function calls. +/// +/// This works by placing every instruction from every basic block in a +/// suffix tree, and repeatedly querying that tree for repeated sequences of +/// instructions. If a sequence of instructions appears often, then it ought +/// to be beneficial to pull out into a function. +/// +/// This was originally presented at the 2016 LLVM Developers' Meeting in the +/// talk "Reducing Code Size Using Outlining". For a high-level overview of +/// how this pass works, the talk is available on YouTube at +/// +/// https://www.youtube.com/watch?v=yorld-WSOeU +/// +/// The slides for the talk are available at +/// +/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf +/// +/// The talk provides an overview of how the outliner finds candidates and +/// ultimately outlines them. It describes how the main data structure for this +/// pass, the suffix tree, is queried and purged for candidates. It also gives +/// a simplified suffix tree construction algorithm for suffix trees based off +/// of the algorithm actually used here, Ukkonen's algorithm. 
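As a rough illustration of what the outliner's suffix tree is queried for, a brute-force stand-in (not the algorithm this pass uses) that finds the longest repeated run in a sequence of instruction hashes; names and values are illustrative only:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Brute-force stand-in for the suffix-tree query: length of the longest
// sequence of "instruction hashes" that appears at least twice. The real
// pass gets this in linear time from the suffix tree.
static size_t longestRepeat(const std::vector<unsigned> &S) {
  size_t Best = 0;
  for (size_t I = 0; I < S.size(); ++I)
    for (size_t J = I + 1; J < S.size(); ++J) {
      size_t K = 0;
      while (J + K < S.size() && S[I + K] == S[J + K])
        ++K;
      Best = std::max(Best, K);
    }
  return Best;
}

int main() {
  // "1 2 3" appears twice, so a 3-instruction outlining candidate exists.
  std::vector<unsigned> Seq = {1, 2, 3, 9, 1, 2, 3, 7};
  assert(longestRepeat(Seq) == 3);
  return 0;
}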
+/// +/// For the original RFC for this pass, please see +/// +/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html +/// +/// For more information on the suffix tree data structure, please see +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +/// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <functional> +#include <map> +#include <sstream> +#include <tuple> +#include <vector> + +#define DEBUG_TYPE "machine-outliner" + +using namespace llvm; + +STATISTIC(NumOutlined, "Number of candidates outlined"); +STATISTIC(FunctionsCreated, "Number of functions created"); + +namespace { + +/// \brief An individual sequence of instructions to be replaced with a call to +/// an outlined function. +struct Candidate { + + /// Set to false if the candidate overlapped with another candidate. + bool InCandidateList = true; + + /// The start index of this \p Candidate. + size_t StartIdx; + + /// The number of instructions in this \p Candidate. + size_t Len; + + /// The index of this \p Candidate's \p OutlinedFunction in the list of + /// \p OutlinedFunctions. + size_t FunctionIdx; + + /// \brief The number of instructions that would be saved by outlining every + /// candidate of this type. + /// + /// This is a fixed value which is not updated during the candidate pruning + /// process. It is only used for deciding which candidate to keep if two + /// candidates overlap. The true benefit is stored in the OutlinedFunction + /// for some given candidate. + unsigned Benefit = 0; + + Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx) + : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {} + + Candidate() {} + + /// \brief Used to ensure that \p Candidates are outlined in an order that + /// preserves the start and end indices of other \p Candidates. + bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; } +}; + +/// \brief The information necessary to create an outlined function for some +/// class of candidate. +struct OutlinedFunction { + + /// The actual outlined function created. + /// This is initialized after we go through and create the actual function. + MachineFunction *MF = nullptr; + + /// A number assigned to this function which appears at the end of its name. + size_t Name; + + /// The number of candidates for this OutlinedFunction. + size_t OccurrenceCount = 0; + + /// \brief The sequence of integers corresponding to the instructions in this + /// function. + std::vector<unsigned> Sequence; + + /// The number of instructions this function would save. + unsigned Benefit = 0; + + /// \brief Set to true if candidates for this outlined function should be + /// replaced with tail calls to this OutlinedFunction. 
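The inverted operator< on Candidate above is worth a tiny sketch: sorting with it yields candidates in decreasing StartIdx order, so outlining later sequences first never shifts the indices of earlier ones. CandidateSketch and its values below are illustrative, not the pass's types:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

struct CandidateSketch {
  size_t StartIdx;
  size_t Len;
  // Mirrors the inverted comparison above: "smaller" means a later start.
  bool operator<(const CandidateSketch &RHS) const {
    return StartIdx > RHS.StartIdx;
  }
};

int main() {
  std::vector<CandidateSketch> Cands = {{4, 3}, {20, 3}, {11, 3}};
  std::sort(Cands.begin(), Cands.end());
  // Outlining proceeds from the highest start index down, so edits at
  // indices 20 and 11 cannot invalidate the candidate starting at index 4.
  assert(Cands[0].StartIdx == 20 && Cands[2].StartIdx == 4);
  return 0;
}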
+ bool IsTailCall = false; + + OutlinedFunction(size_t Name, size_t OccurrenceCount, + const std::vector<unsigned> &Sequence, + unsigned Benefit, bool IsTailCall) + : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence), + Benefit(Benefit), IsTailCall(IsTailCall) + {} +}; + +/// Represents an undefined index in the suffix tree. +const size_t EmptyIdx = -1; + +/// A node in a suffix tree which represents a substring or suffix. +/// +/// Each node has either no children or at least two children, with the root +/// being a exception in the empty tree. +/// +/// Children are represented as a map between unsigned integers and nodes. If +/// a node N has a child M on unsigned integer k, then the mapping represented +/// by N is a proper prefix of the mapping represented by M. Note that this, +/// although similar to a trie is somewhat different: each node stores a full +/// substring of the full mapping rather than a single character state. +/// +/// Each internal node contains a pointer to the internal node representing +/// the same string, but with the first character chopped off. This is stored +/// in \p Link. Each leaf node stores the start index of its respective +/// suffix in \p SuffixIdx. +struct SuffixTreeNode { + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + DenseMap<unsigned, SuffixTreeNode *> Children; + + /// A flag set to false if the node has been pruned from the tree. + bool IsInTree = true; + + /// The start index of this node's substring in the main string. + size_t StartIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + size_t *EndIdx = nullptr; + + /// For leaves, the start index of the suffix represented by this node. + /// + /// For all other nodes, this is ignored. + size_t SuffixIdx = EmptyIdx; + + /// \brief For internal nodes, a pointer to the internal node representing + /// the same sequence with the first character chopped off. + /// + /// This has two major purposes in the suffix tree. The first is as a + /// shortcut in Ukkonen's construction algorithm. One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). 
+ /// + /// The suffix link is also used during the tree pruning process to let us + /// quickly throw out a bunch of potential overlaps. Say we have a sequence + /// S we want to outline. Then each of its suffixes contribute to at least + /// one overlapping case. Therefore, we can follow the suffix links + /// starting at the node associated with S to the root and "delete" those + /// nodes, save for the root. For each candidate, this removes + /// O(|candidate|) overlaps from the search space. We don't actually + /// completely invalidate these nodes though; doing that is far too + /// aggressive. Consider the following pathological string: + /// + /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 + /// + /// If we, for the sake of example, outlined 1 2 3, then we would throw + /// out all instances of 2 3. This isn't desirable. To get around this, + /// when we visit a link node, we decrement its occurrence count by the + /// number of sequences we outlined in the current step. In the pathological + /// example, the 2 3 node would have an occurrence count of 8, while the + /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node + /// would survive to the next round allowing us to outline the extra + /// instances of 2 3. + SuffixTreeNode *Link = nullptr; + + /// The parent of this node. Every node except for the root has a parent. + SuffixTreeNode *Parent = nullptr; + + /// The number of times this node's string appears in the tree. + /// + /// This is equal to the number of leaf children of the string. It represents + /// the number of suffixes that the node's string is a prefix of. + size_t OccurrenceCount = 0; + + /// The length of the string formed by concatenating the edge labels from the + /// root to this node. + size_t ConcatLen = 0; + + /// Returns true if this node is a leaf. + bool isLeaf() const { return SuffixIdx != EmptyIdx; } + + /// Returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const { return StartIdx == EmptyIdx; } + + /// Return the number of elements in the substring associated with this node. + size_t size() const { + + // Is it the root? If so, it's the empty string so return 0. + if (isRoot()) + return 0; + + assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); + + // Size = the number of elements in the string. + // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. + return *EndIdx - StartIdx + 1; + } + + SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link, + SuffixTreeNode *Parent) + : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {} + + SuffixTreeNode() {} +}; + +/// A data structure for fast substring queries. +/// +/// Suffix trees represent the suffixes of their input strings in their leaves. +/// A suffix tree is a type of compressed trie structure where each node +/// represents an entire substring rather than a single character. Each leaf +/// of the tree is a suffix. +/// +/// A suffix tree can be seen as a type of state machine where each state is a +/// substring of the full string. The tree is structured so that, for a string +/// of length N, there are exactly N leaves in the tree. This structure allows +/// us to quickly find repeated substrings of the input string. +/// +/// In this implementation, a "string" is a vector of unsigned integers. +/// These integers may result from hashing some data type. A suffix tree can +/// contain 1 or many strings, which can then be queried as one large string. 
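The occurrence counts quoted in the pathological example above (8 for "2 3", 2 for "1 2 3") can be confirmed with a naive count; in the tree they would come from each node's OccurrenceCount. This is a standalone check, not part of the pass:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Naive count of how often Pat occurs in S; stands in for reading a node's
// OccurrenceCount off the suffix tree.
static size_t countOccurrences(const std::vector<unsigned> &S,
                               const std::vector<unsigned> &Pat) {
  size_t N = 0;
  for (size_t I = 0; I + Pat.size() <= S.size(); ++I)
    if (std::equal(Pat.begin(), Pat.end(), S.begin() + I))
      ++N;
  return N;
}

int main() {
  std::vector<unsigned> S = {1, 2, 3, 1, 2, 3, 2, 3, 2, 3,
                             2, 3, 2, 3, 2, 3, 2, 3};
  assert(countOccurrences(S, {2, 3}) == 8);
  assert(countOccurrences(S, {1, 2, 3}) == 2);
  return 0;
}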
+/// +/// The suffix tree is implemented using Ukkonen's algorithm for linear-time +/// suffix tree construction. Ukkonen's algorithm is explained in more detail +/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +/// paper is available at +/// +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +class SuffixTree { +private: + /// Each element is an integer representing an instruction in the module. + ArrayRef<unsigned> Str; + + /// Maintains each node in the tree. + SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; + + /// The root of the suffix tree. + /// + /// The root represents the empty string. It is maintained by the + /// \p NodeAllocator like every other node in the tree. + SuffixTreeNode *Root = nullptr; + + /// Stores each leaf node in the tree. + /// + /// This is used for finding outlining candidates. + std::vector<SuffixTreeNode *> LeafVector; + + /// Maintains the end indices of the internal nodes in the tree. + /// + /// Each internal node is guaranteed to never have its end index change + /// during the construction algorithm; however, leaves must be updated at + /// every step. Therefore, we need to store leaf end indices by reference + /// to avoid updating O(N) leaves at every step of construction. Thus, + /// every internal node must be allocated its own end index. + BumpPtrAllocator InternalEndIdxAllocator; + + /// The end index of each leaf in the tree. + size_t LeafEndIdx = -1; + + /// \brief Helper struct which keeps track of the next insertion point in + /// Ukkonen's algorithm. + struct ActiveState { + /// The next node to insert at. + SuffixTreeNode *Node; + + /// The index of the first character in the substring currently being added. + size_t Idx = EmptyIdx; + + /// The length of the substring we have to add at the current step. + size_t Len = 0; + }; + + /// \brief The point the next insertion will take place at in the + /// construction algorithm. + ActiveState Active; + + /// Allocate a leaf node and add it to the tree. + /// + /// \param Parent The parent of this node. + /// \param StartIdx The start index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated leaf node. + SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx, + unsigned Edge) { + + assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); + + SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, + &LeafEndIdx, + nullptr, + &Parent); + Parent.Children[Edge] = N; + + return N; + } + + /// Allocate an internal node and add it to the tree. + /// + /// \param Parent The parent of this node. Only null when allocating the root. + /// \param StartIdx The start index of this node's associated string. + /// \param EndIdx The end index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated internal node. 
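The shared end index mentioned above is what keeps leaf extension cheap: every leaf points at the single \p LeafEndIdx, so one write extends all of them. Below is a minimal standalone sketch of just that trick, with a made-up ToyLeaf type that only mirrors the StartIdx/EndIdx fields and is not the tree's real node type:

#include <cassert>
#include <cstddef>
#include <vector>

struct ToyLeaf {
  size_t StartIdx;
  const size_t *EndIdx; // Shared by every leaf, like LeafEndIdx above.
  size_t size() const { return *EndIdx - StartIdx + 1; }
};

int main() {
  size_t LeafEnd = 2;
  std::vector<ToyLeaf> Leaves = {{0, &LeafEnd}, {1, &LeafEnd}, {2, &LeafEnd}};

  // One write "extends" every leaf at once, instead of touching O(N) leaves.
  LeafEnd = 5;
  assert(Leaves[0].size() == 6 && Leaves[2].size() == 4);
  return 0;
}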
+ SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx, + size_t EndIdx, unsigned Edge) { + + assert(StartIdx <= EndIdx && "String can't start after it ends!"); + assert(!(!Parent && StartIdx != EmptyIdx) && + "Non-root internal nodes must have parents!"); + + size_t *E = new (InternalEndIdxAllocator) size_t(EndIdx); + SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, + E, + Root, + Parent); + if (Parent) + Parent->Children[Edge] = N; + + return N; + } + + /// \brief Set the suffix indices of the leaves to the start indices of their + /// respective suffixes. Also stores each leaf in \p LeafVector at its + /// respective suffix index. + /// + /// \param[in] CurrNode The node currently being visited. + /// \param CurrIdx The current index of the string being visited. + void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) { + + bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot(); + + // Store the length of the concatenation of all strings from the root to + // this node. + if (!CurrNode.isRoot()) { + if (CurrNode.ConcatLen == 0) + CurrNode.ConcatLen = CurrNode.size(); + + if (CurrNode.Parent) + CurrNode.ConcatLen += CurrNode.Parent->ConcatLen; + } + + // Traverse the tree depth-first. + for (auto &ChildPair : CurrNode.Children) { + assert(ChildPair.second && "Node had a null child!"); + setSuffixIndices(*ChildPair.second, + CurrIdx + ChildPair.second->size()); + } + + // Is this node a leaf? + if (IsLeaf) { + // If yes, give it a suffix index and bump its parent's occurrence count. + CurrNode.SuffixIdx = Str.size() - CurrIdx; + assert(CurrNode.Parent && "CurrNode had no parent!"); + CurrNode.Parent->OccurrenceCount++; + + // Store the leaf in the leaf vector for pruning later. + LeafVector[CurrNode.SuffixIdx] = &CurrNode; + } + } + + /// \brief Construct the suffix tree for the prefix of the input ending at + /// \p EndIdx. + /// + /// Used to construct the full suffix tree iteratively. At the end of each + /// step, the constructed suffix tree is either a valid suffix tree, or a + /// suffix tree with implicit suffixes. At the end of the final step, the + /// suffix tree is a valid tree. + /// + /// \param EndIdx The end index of the current prefix in the main string. + /// \param SuffixesToAdd The number of suffixes that must be added + /// to complete the suffix tree at the current phase. + /// + /// \returns The number of suffixes that have not been added at the end of + /// this step. + unsigned extend(size_t EndIdx, size_t SuffixesToAdd) { + SuffixTreeNode *NeedsLink = nullptr; + + while (SuffixesToAdd > 0) { + + // Are we waiting to add anything other than just the last character? + if (Active.Len == 0) { + // If not, then say the active index is the end index. + Active.Idx = EndIdx; + } + + assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); + + // The first character in the current substring we're looking at. + unsigned FirstChar = Str[Active.Idx]; + + // Have we inserted anything starting with FirstChar at the current node? + if (Active.Node->Children.count(FirstChar) == 0) { + // If not, then we can just insert a leaf and move too the next step. + insertLeaf(*Active.Node, EndIdx, FirstChar); + + // The active node is an internal node, and we visited it, so it must + // need a link if it doesn't have one. + if (NeedsLink) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + } else { + // There's a match with FirstChar, so look for the point in the tree to + // insert a new node. 
+ SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; + + size_t SubstringLen = NextNode->size(); + + // Is the current suffix we're trying to insert longer than the size of + // the child we want to move to? + if (Active.Len >= SubstringLen) { + // If yes, then consume the characters we've seen and move to the next + // node. + Active.Idx += SubstringLen; + Active.Len -= SubstringLen; + Active.Node = NextNode; + continue; + } + + // Otherwise, the suffix we're trying to insert must be contained in the + // next node we want to move to. + unsigned LastChar = Str[EndIdx]; + + // Is the string we're trying to insert a substring of the next node? + if (Str[NextNode->StartIdx + Active.Len] == LastChar) { + // If yes, then we're done for this step. Remember our insertion point + // and move to the next end index. At this point, we have an implicit + // suffix tree. + if (NeedsLink && !Active.Node->isRoot()) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + + Active.Len++; + break; + } + + // The string we're trying to insert isn't a substring of the next node, + // but matches up to a point. Split the node. + // + // For example, say we ended our search at a node n and we're trying to + // insert ABD. Then we'll create a new node s for AB, reduce n to just + // representing C, and insert a new leaf node l to represent d. This + // allows us to ensure that if n was a leaf, it remains a leaf. + // + // | ABC ---split---> | AB + // n s + // C / \ D + // n l + + // The node s from the diagram + SuffixTreeNode *SplitNode = + insertInternalNode(Active.Node, + NextNode->StartIdx, + NextNode->StartIdx + Active.Len - 1, + FirstChar); + + // Insert the new node representing the new substring into the tree as + // a child of the split node. This is the node l from the diagram. + insertLeaf(*SplitNode, EndIdx, LastChar); + + // Make the old node a child of the split node and update its start + // index. This is the node n from the diagram. + NextNode->StartIdx += Active.Len; + NextNode->Parent = SplitNode; + SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; + + // SplitNode is an internal node, update the suffix link. + if (NeedsLink) + NeedsLink->Link = SplitNode; + + NeedsLink = SplitNode; + } + + // We've added something new to the tree, so there's one less suffix to + // add. + SuffixesToAdd--; + + if (Active.Node->isRoot()) { + if (Active.Len > 0) { + Active.Len--; + Active.Idx = EndIdx - SuffixesToAdd + 1; + } + } else { + // Start the next phase at the next smallest suffix. + Active.Node = Active.Node->Link; + } + } + + return SuffixesToAdd; + } + +public: + + /// Find all repeated substrings that satisfy \p BenefitFn. + /// + /// If a substring appears at least twice, then it must be represented by + /// an internal node which appears in at least two suffixes. Each suffix is + /// represented by a leaf node. To do this, we visit each internal node in + /// the tree, using the leaf children of each internal node. If an internal + /// node represents a beneficial substring, then we use each of its leaf + /// children to find the locations of its substring. + /// + /// \param[out] CandidateList Filled with candidates representing each + /// beneficial substring. + /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions each + /// type of candidate. + /// \param BenefitFn The function to satisfy. + /// + /// \returns The length of the longest candidate found. 
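The benefit query can be pictured with some simple arithmetic. The sketch below is a toy stand-in for the \p BenefitFn that findCandidates takes; the costs (one unit per instruction, per call, and per return) are invented for illustration only, whereas in the pass the target's getOutliningBenefit() makes the real decision:

#include <iostream>

// Toy estimate of instructions saved by outlining a sequence of SequenceLen
// instructions that occurs Occurrences times. Every cost below is made up.
static unsigned toyOutliningBenefit(unsigned SequenceLen, unsigned Occurrences,
                                    bool IsTailCall) {
  unsigned NotOutlinedCost = SequenceLen * Occurrences;
  // Each occurrence becomes a call; the body is emitted once, plus a return
  // unless every occurrence can be replaced by a tail call.
  unsigned OutlinedCost = Occurrences + SequenceLen + (IsTailCall ? 0 : 1);
  return NotOutlinedCost > OutlinedCost ? NotOutlinedCost - OutlinedCost : 0;
}

int main() {
  // A 3-instruction sequence appearing 4 times: 12 instructions become
  // 4 calls plus a 4-instruction outlined body, saving 4.
  std::cout << toyOutliningBenefit(3, 4, false) << "\n"; // 4
  // A 2-instruction sequence appearing twice is not worth a non-tail call.
  std::cout << toyOutliningBenefit(2, 2, false) << "\n"; // 0
  return 0;
}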
+  size_t findCandidates(std::vector<Candidate> &CandidateList,
+                        std::vector<OutlinedFunction> &FunctionList,
+                        const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)>
+                            &BenefitFn) {
+
+    CandidateList.clear();
+    FunctionList.clear();
+    size_t FnIdx = 0;
+    size_t MaxLen = 0;
+
+    for (SuffixTreeNode* Leaf : LeafVector) {
+      assert(Leaf && "Leaves in LeafVector cannot be null!");
+      if (!Leaf->IsInTree)
+        continue;
+
+      assert(Leaf->Parent && "All leaves must have parents!");
+      SuffixTreeNode &Parent = *(Leaf->Parent);
+
+      // If it doesn't appear enough, or we already outlined from it, skip it.
+      if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
+        continue;
+
+      size_t StringLen = Leaf->ConcatLen - Leaf->size();
+
+      // How many instructions would outlining this string save?
+      unsigned Benefit = BenefitFn(Parent,
+          StringLen, Str[Leaf->SuffixIdx + StringLen - 1]);
+
+      // If it's not beneficial, skip it.
+      if (Benefit < 1)
+        continue;
+
+      if (StringLen > MaxLen)
+        MaxLen = StringLen;
+
+      unsigned OccurrenceCount = 0;
+      for (auto &ChildPair : Parent.Children) {
+        SuffixTreeNode *M = ChildPair.second;
+
+        // Is it a leaf? If so, we have an occurrence of this candidate.
+        if (M && M->IsInTree && M->isLeaf()) {
+          OccurrenceCount++;
+          CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx);
+          CandidateList.back().Benefit = Benefit;
+          M->IsInTree = false;
+        }
+      }
+
+      // Save the function for the new candidate sequence.
+      std::vector<unsigned> CandidateSequence;
+      for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+        CandidateSequence.push_back(Str[i]);
+
+      FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence,
+                                Benefit, false);
+
+      // Move to the next function.
+      FnIdx++;
+      Parent.IsInTree = false;
+    }
+
+    return MaxLen;
+  }
+
+  /// Construct a suffix tree from a sequence of unsigned integers.
+  ///
+  /// \param Str The string to construct the suffix tree for.
+  SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+    Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+    Root->IsInTree = true;
+    Active.Node = Root;
+    LeafVector = std::vector<SuffixTreeNode*>(Str.size());
+
+    // Keep track of the number of suffixes we have to add of the current
+    // prefix.
+    size_t SuffixesToAdd = 0;
+    Active.Node = Root;
+
+    // Construct the suffix tree iteratively on each prefix of the string.
+    // PfxEndIdx is the end index of the current prefix.
+    // End is one past the last element in the string.
+    for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+      SuffixesToAdd++;
+      LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+      SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+    }
+
+    // Set the suffix indices of each leaf.
+    assert(Root && "Root node can't be nullptr!");
+    setSuffixIndices(*Root, 0);
+  }
+};
+
+/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+  /// \brief The next available integer to assign to a \p MachineInstr that
+  /// cannot be outlined.
+  ///
+  /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+  unsigned IllegalInstrNumber = -3;
+
+  /// \brief The next available integer to assign to a \p MachineInstr that can
+  /// be outlined.
+  unsigned LegalInstrNumber = 0;
+
+  /// Correspondence from \p MachineInstrs to unsigned integers.
+  DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+      InstructionIntegerMap;
+
+  /// Correspondence from unsigned integers to \p MachineInstrs.
+ /// Inverse of \p InstructionIntegerMap. + DenseMap<unsigned, MachineInstr *> IntegerInstructionMap; + + /// The vector of unsigned integers that the module is mapped to. + std::vector<unsigned> UnsignedVec; + + /// \brief Stores the location of the instruction associated with the integer + /// at index i in \p UnsignedVec for each index i. + std::vector<MachineBasicBlock::iterator> InstrList; + + /// \brief Maps \p *It to a legal integer. + /// + /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap, + /// \p IntegerInstructionMap, and \p LegalInstrNumber. + /// + /// \returns The integer that \p *It was mapped to. + unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) { + + // Get the integer for this instruction or give it the current + // LegalInstrNumber. + InstrList.push_back(It); + MachineInstr &MI = *It; + bool WasInserted; + DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator + ResultIt; + std::tie(ResultIt, WasInserted) = + InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber)); + unsigned MINumber = ResultIt->second; + + // There was an insertion. + if (WasInserted) { + LegalInstrNumber++; + IntegerInstructionMap.insert(std::make_pair(MINumber, &MI)); + } + + UnsignedVec.push_back(MINumber); + + // Make sure we don't overflow or use any integers reserved by the DenseMap. + if (LegalInstrNumber >= IllegalInstrNumber) + report_fatal_error("Instruction mapping overflow!"); + + assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() + && "Tried to assign DenseMap tombstone or empty key to instruction."); + assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() + && "Tried to assign DenseMap tombstone or empty key to instruction."); + + return MINumber; + } + + /// Maps \p *It to an illegal integer. + /// + /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber. + /// + /// \returns The integer that \p *It was mapped to. + unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) { + unsigned MINumber = IllegalInstrNumber; + + InstrList.push_back(It); + UnsignedVec.push_back(IllegalInstrNumber); + IllegalInstrNumber--; + + assert(LegalInstrNumber < IllegalInstrNumber && + "Instruction mapping overflow!"); + + assert(IllegalInstrNumber != + DenseMapInfo<unsigned>::getEmptyKey() && + "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + + assert(IllegalInstrNumber != + DenseMapInfo<unsigned>::getTombstoneKey() && + "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + + return MINumber; + } + + /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds + /// and appends it to \p UnsignedVec and \p InstrList. + /// + /// Two instructions are assigned the same integer if they are identical. + /// If an instruction is deemed unsafe to outline, then it will be assigned an + /// unique integer. The resulting mapping is placed into a suffix tree and + /// queried for candidates. + /// + /// \param MBB The \p MachineBasicBlock to be translated into integers. + /// \param TRI \p TargetRegisterInfo for the module. + /// \param TII \p TargetInstrInfo for the module. + void convertToUnsignedVec(MachineBasicBlock &MBB, + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII) { + for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et; + It++) { + + // Keep track of where this instruction is in the module. 
+ switch(TII.getOutliningType(*It)) { + case TargetInstrInfo::MachineOutlinerInstrType::Illegal: + mapToIllegalUnsigned(It); + break; + + case TargetInstrInfo::MachineOutlinerInstrType::Legal: + mapToLegalUnsigned(It); + break; + + case TargetInstrInfo::MachineOutlinerInstrType::Invisible: + break; + } + } + + // After we're done every insertion, uniquely terminate this part of the + // "string". This makes sure we won't match across basic block or function + // boundaries since the "end" is encoded uniquely and thus appears in no + // repeated substring. + InstrList.push_back(MBB.end()); + UnsignedVec.push_back(IllegalInstrNumber); + IllegalInstrNumber--; + } + + InstructionMapper() { + // Make sure that the implementation of DenseMapInfo<unsigned> hasn't + // changed. + assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && + "DenseMapInfo<unsigned>'s empty key isn't -1!"); + assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && + "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + } +}; + +/// \brief An interprocedural pass which finds repeated sequences of +/// instructions and replaces them with calls to functions. +/// +/// Each instruction is mapped to an unsigned integer and placed in a string. +/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree +/// is then repeatedly queried for repeated sequences of instructions. Each +/// non-overlapping repeated sequence is then placed in its own +/// \p MachineFunction and each instance is then replaced with a call to that +/// function. +struct MachineOutliner : public ModulePass { + + static char ID; + + StringRef getPassName() const override { return "Machine Outliner"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineModuleInfo>(); + AU.addPreserved<MachineModuleInfo>(); + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } + + MachineOutliner() : ModulePass(ID) { + initializeMachineOutlinerPass(*PassRegistry::getPassRegistry()); + } + + /// \brief Replace the sequences of instructions represented by the + /// \p Candidates in \p CandidateList with calls to \p MachineFunctions + /// described in \p FunctionList. + /// + /// \param M The module we are outlining from. + /// \param CandidateList A list of candidates to be outlined. + /// \param FunctionList A list of functions to be inserted into the module. + /// \param Mapper Contains the instruction mappings for the module. + bool outline(Module &M, const ArrayRef<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + InstructionMapper &Mapper); + + /// Creates a function for \p OF and inserts it into the module. + MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF, + InstructionMapper &Mapper); + + /// Find potential outlining candidates and store them in \p CandidateList. + /// + /// For each type of potential candidate, also build an \p OutlinedFunction + /// struct containing the information to build the function for that + /// candidate. + /// + /// \param[out] CandidateList Filled with outlining candidates for the module. + /// \param[out] FunctionList Filled with functions corresponding to each type + /// of \p Candidate. + /// \param ST The suffix tree for the module. + /// \param TII TargetInstrInfo for the module. + /// + /// \returns The length of the longest candidate found. 0 if there are none. 
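To make the instruction-to-integer mapping described above concrete, here is a standalone sketch that uses strings in place of \p MachineInstrs. The instruction text and the std::map are stand-ins, but the two counters behave like LegalInstrNumber and IllegalInstrNumber: identical instructions share a value counting up from 0, while anything that must never take part in a repeated sequence gets a fresh value counting down from (unsigned)-3, since -1 and -2 are reserved by DenseMapInfo<unsigned>:

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  unsigned LegalInstrNumber = 0;
  unsigned IllegalInstrNumber = (unsigned)-3;
  std::map<std::string, unsigned> InstrToInt;
  std::vector<unsigned> UnsignedVec;

  // Map an outlinable "instruction": identical text gets an identical value.
  auto MapLegal = [&](const std::string &MI) {
    auto Result = InstrToInt.insert({MI, LegalInstrNumber});
    if (Result.second)
      ++LegalInstrNumber;
    UnsignedVec.push_back(Result.first->second);
  };

  MapLegal("add x0, x1");
  MapLegal("mul x2, x3");
  UnsignedVec.push_back(IllegalInstrNumber--); // e.g. an unsafe instruction
  MapLegal("add x0, x1");

  for (unsigned U : UnsignedVec)
    std::cout << U << " "; // 0 1 4294967293 0 (with 32-bit unsigned)
  std::cout << "\n";
  return 0;
}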
+  unsigned buildCandidateList(std::vector<Candidate> &CandidateList,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              SuffixTree &ST,
+                              InstructionMapper &Mapper,
+                              const TargetInstrInfo &TII);
+
+  /// \brief Remove any overlapping candidates that weren't handled by the
+  /// suffix tree's pruning method.
+  ///
+  /// Pruning from the suffix tree doesn't necessarily remove all overlaps.
+  /// If a short candidate is chosen for outlining, and then a longer candidate
+  /// which has that short candidate as a suffix is also chosen, the tree's
+  /// pruning method will not find the overlap. Thus, we need to prune before
+  /// outlining as well.
+  ///
+  /// \param[in,out] CandidateList A list of outlining candidates.
+  /// \param[in,out] FunctionList A list of functions to be outlined.
+  /// \param MaxCandidateLen The length of the longest candidate.
+  /// \param TII TargetInstrInfo for the module.
+  void pruneOverlaps(std::vector<Candidate> &CandidateList,
+                     std::vector<OutlinedFunction> &FunctionList,
+                     unsigned MaxCandidateLen,
+                     const TargetInstrInfo &TII);
+
+  /// Construct a suffix tree on the instructions in \p M and outline repeated
+  /// strings from that tree.
+  bool runOnModule(Module &M) override;
+};
+
+} // Anonymous namespace.
+
+char MachineOutliner::ID = 0;
+
+namespace llvm {
+ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); }
+}
+
+INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE,
+                "Machine Function Outliner", false, false)
+
+void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    unsigned MaxCandidateLen,
+                                    const TargetInstrInfo &TII) {
+  // TODO: Experiment with interval trees or other interval-checking structures
+  // to lower the time complexity of this function.
+  // TODO: Can we do better than the simple greedy choice?
+  // Check for overlaps in the range.
+  // This is O(MaxCandidateLen * CandidateList.size()).
+  for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et;
+       It++) {
+    Candidate &C1 = *It;
+    OutlinedFunction &F1 = FunctionList[C1.FunctionIdx];
+
+    // If we removed this candidate, skip it.
+    if (!C1.InCandidateList)
+      continue;
+
+    // Is it still worth it to outline C1?
+    if (F1.Benefit < 1 || F1.OccurrenceCount < 2) {
+      assert(F1.OccurrenceCount > 0 &&
+             "Can't remove OutlinedFunction with no occurrences!");
+      F1.OccurrenceCount--;
+      C1.InCandidateList = false;
+      continue;
+    }
+
+    // The minimum start index of any candidate that could overlap with this
+    // one.
+    unsigned FarthestPossibleIdx = 0;
+
+    // Either the index is 0, or it's at most MaxCandidateLen indices away.
+    if (C1.StartIdx > MaxCandidateLen)
+      FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen;
+
+    // Compare against the candidates in the list whose start index is at least
+    // FarthestPossibleIdx, i.e. at most MaxCandidateLen indices away from C1.
+    // There are at most MaxCandidateLen of these.
+    for (auto Sit = It + 1; Sit != Et; Sit++) {
+      Candidate &C2 = *Sit;
+      OutlinedFunction &F2 = FunctionList[C2.FunctionIdx];
+
+      // Is this candidate too far away to overlap?
+      if (C2.StartIdx < FarthestPossibleIdx)
+        break;
+
+      // Did we already remove this candidate in a previous step?
+      if (!C2.InCandidateList)
+        continue;
+
+      // Is the function beneficial to outline?
+      if (F2.OccurrenceCount < 2 || F2.Benefit < 1) {
+        // If not, remove this candidate and move to the next one.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        C2.InCandidateList = false;
+        continue;
+      }
+
+      size_t C2End = C2.StartIdx + C2.Len - 1;
+
+      // Do C1 and C2 overlap?
+      //
+      // Not overlapping:
+      // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices
+      //
+      // We sorted our candidate list so C2Start <= C1Start. We know that
+      // C2End > C2Start since each candidate has length >= 2. Therefore, all
+      // we have to check is whether C2End < C1Start; if it is, the candidates
+      // don't overlap.
+      if (C2End < C1.StartIdx)
+        continue;
+
+      // C1 and C2 overlap.
+      // We need to choose the better of the two.
+      //
+      // Approximate this by picking the one which would have saved us the
+      // most instructions before any pruning.
+      if (C1.Benefit >= C2.Benefit) {
+
+        // C1 is better, so remove C2 and update C2's OutlinedFunction to
+        // reflect the removal.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(),
+                                             F2.OccurrenceCount,
+                                             F2.IsTailCall
+                                             );
+
+        C2.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C2. \n";
+          dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n";
+          dbgs() << "--- C2's benefit: " << F2.Benefit << "\n";
+        );
+
+      } else {
+        // C2 is better, so remove C1 and update C1's OutlinedFunction to
+        // reflect the removal.
+        assert(F1.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F1.OccurrenceCount--;
+        F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(),
+                                             F1.OccurrenceCount,
+                                             F1.IsTailCall
+                                             );
+        C1.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C1. \n";
+          dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n";
+          dbgs() << "--- C1's benefit: " << F1.Benefit << "\n";
+        );
+
+        // C1 is out, so we don't have to compare it against anyone else.
+        break;
+      }
+    }
+  }
+}
+
+unsigned
+MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    SuffixTree &ST,
+                                    InstructionMapper &Mapper,
+                                    const TargetInstrInfo &TII) {
+
+  std::vector<unsigned> CandidateSequence; // Current outlining candidate.
+  size_t MaxCandidateLen = 0;              // Length of the longest candidate.
+
+  // Benefit function used for the maximizing query in the suffix tree.
+  // This allows us to define more fine-grained types of things to outline in
+  // the target without putting target-specific info in the suffix tree.
+  auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr,
+                                   size_t StringLen, unsigned EndVal) {
+
+    // The root represents the empty string.
+    if (Curr.isRoot())
+      return 0u;
+
+    // Is this long enough to outline?
+    // TODO: Let the target decide how "long" a string is in terms of the sizes
+    // of the instructions in the string. For example, if a call instruction
+    // is smaller than a one instruction string, we should outline that string.
+    if (StringLen < 2)
+      return 0u;
+
+    size_t Occurrences = Curr.OccurrenceCount;
+
+    // Anything we want to outline has to appear at least twice.
+    if (Occurrences < 2)
+      return 0u;
+
+    // Check if the last instruction in the sequence is a return.
+    MachineInstr *LastInstr =
+        Mapper.IntegerInstructionMap[EndVal];
+    assert(LastInstr && "Last instruction in sequence was unmapped!");
+
+    // The only way a terminator could be mapped as legal is if it was safe to
+    // tail call.
+    bool IsTailCall = LastInstr->isTerminator();
+    return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall);
+  };
+
+  MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn);
+
+  for (auto &OF : FunctionList)
+    OF.IsTailCall = Mapper.
+        IntegerInstructionMap[OF.Sequence.back()]->isTerminator();
+
+  // Sort the candidates in descending order. This will simplify the outlining
+  // process when we have to remove the candidates from the mapping by
+  // allowing us to cut them out without keeping track of an offset.
+  std::stable_sort(CandidateList.begin(), CandidateList.end());
+
+  return MaxCandidateLen;
+}
+
+MachineFunction *
+MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+                                        InstructionMapper &Mapper) {
+
+  // Create the function name. This should be unique. For now, just number the
+  // outlined functions in the order their candidate sequences were found.
+  std::ostringstream NameStream;
+  NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name;
+
+  // Create the function using an IR-level function.
+  LLVMContext &C = M.getContext();
+  Function *F = dyn_cast<Function>(
+      M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C)));
+  assert(F && "Function was null!");
+
+  // NOTE: If this is linkonceodr, then we can take advantage of linker deduping
+  // which gives us better results when we outline from linkonceodr functions.
+  F->setLinkage(GlobalValue::PrivateLinkage);
+  F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  IRBuilder<> Builder(EntryBB);
+  Builder.CreateRetVoid();
+
+  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+  MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+  MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  // Insert the new basic block into the outlined function.
+  MF.insert(MF.begin(), &MBB);
+
+  TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall);
+
+  // Copy over the instructions for the function using the integer mappings in
+  // its sequence.
+  for (unsigned Str : OF.Sequence) {
+    MachineInstr *NewMI =
+        MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second);
+    NewMI->dropMemRefs();
+
+    // Don't keep debug information for outlined instructions.
+    // FIXME: This means outlined functions are currently undebuggable.
+    NewMI->setDebugLoc(DebugLoc());
+    MBB.insert(MBB.end(), NewMI);
+  }
+
+  TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall);
+
+  return &MF;
+}
+
+bool MachineOutliner::outline(Module &M,
+                              const ArrayRef<Candidate> &CandidateList,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              InstructionMapper &Mapper) {
+
+  bool OutlinedSomething = false;
+
+  // Replace the candidates with calls to their respective outlined functions.
+  for (const Candidate &C : CandidateList) {
+
+    // Was the candidate removed during pruneOverlaps?
+    if (!C.InCandidateList)
+      continue;
+
+    // If not, then look at its OutlinedFunction.
+    OutlinedFunction &OF = FunctionList[C.FunctionIdx];
+
+    // Was its OutlinedFunction made unbeneficial during pruneOverlaps?
+    if (OF.OccurrenceCount < 2 || OF.Benefit < 1)
+      continue;
+
+    // If not, then outline it.
+ assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); + MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent(); + MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx]; + unsigned EndIdx = C.StartIdx + C.Len - 1; + + assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); + MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; + assert(EndIt != MBB->end() && "EndIt out of bounds!"); + + EndIt++; // Erase needs one past the end index. + + // Does this candidate have a function yet? + if (!OF.MF) { + OF.MF = createOutlinedFunction(M, OF, Mapper); + FunctionsCreated++; + } + + MachineFunction *MF = OF.MF; + const TargetSubtargetInfo &STI = MF->getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + // Insert a call to the new function and erase the old sequence. + TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall); + StartIt = Mapper.InstrList[C.StartIdx]; + MBB->erase(StartIt, EndIt); + + OutlinedSomething = true; + + // Statistics. + NumOutlined++; + } + + DEBUG ( + dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n"; + ); + + return OutlinedSomething; +} + +bool MachineOutliner::runOnModule(Module &M) { + + // Is there anything in the module at all? + if (M.empty()) + return false; + + MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); + const TargetSubtargetInfo &STI = MMI.getOrCreateMachineFunction(*M.begin()) + .getSubtarget(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + + InstructionMapper Mapper; + + // Build instruction mappings for each function in the module. + for (Function &F : M) { + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); + + // Is the function empty? Safe to outline from? + if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF)) + continue; + + // If it is, look at each MachineBasicBlock in the function. + for (MachineBasicBlock &MBB : MF) { + + // Is there anything in MBB? + if (MBB.empty()) + continue; + + // If yes, map it. + Mapper.convertToUnsignedVec(MBB, *TRI, *TII); + } + } + + // Construct a suffix tree, use it to find candidates, and then outline them. + SuffixTree ST(Mapper.UnsignedVec); + std::vector<Candidate> CandidateList; + std::vector<OutlinedFunction> FunctionList; + + // Find all of the outlining candidates. + unsigned MaxCandidateLen = + buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII); + + // Remove candidates that overlap with other candidates. + pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII); + + // Outline each of the candidates and return true if something was outlined. 
+ return outline(M, CandidateList, FunctionList, Mapper); +} diff --git a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp index 43a1809..19e9a50 100644 --- a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -61,7 +61,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/SetVector.h" @@ -69,6 +68,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" @@ -552,7 +552,9 @@ public: os << "\n"; } - void dump() const { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif }; /// This class repesents the scheduled code. The main data structure is a @@ -593,7 +595,7 @@ private: /// Virtual register information. MachineRegisterInfo &MRI; - DFAPacketizer *Resources; + std::unique_ptr<DFAPacketizer> Resources; public: SMSchedule(MachineFunction *mf) @@ -604,13 +606,6 @@ public: InitiationInterval = 0; } - ~SMSchedule() { - ScheduledInstrs.clear(); - InstrToCycle.clear(); - RegToStageDiff.clear(); - delete Resources; - } - void reset() { ScheduledInstrs.clear(); InstrToCycle.clear(); @@ -720,13 +715,13 @@ char MachinePipeliner::ID = 0; int MachinePipeliner::NumTries = 0; #endif char &llvm::MachinePipelinerID = MachinePipeliner::ID; -INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) /// The "main" function for implementing Swing Modulo Scheduling. @@ -738,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { return false; if (mf.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + AttributeList::FunctionIndex, Attribute::OptimizeForSize) && !EnableSWPOptSize.getPosition()) return false; @@ -960,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) if (Phi.getOperand(i + 1).getMBB() != Loop) InitVal = Phi.getOperand(i).getReg(); - else if (Phi.getOperand(i + 1).getMBB() == Loop) + else LoopVal = Phi.getOperand(i).getReg(); assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); @@ -2514,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis( MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, bool IsLast) { - // Compute the stage number for the inital value of the Phi, which + // Compute the stage number for the initial value of the Phi, which // comes from the prolog. The prolog to use depends on to which kernel/ // epilog that we're adding the Phi. 
unsigned PrologStage = 0; @@ -3480,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep, // increment value to determine if the accesses may be loop carried. if (OffsetS >= OffsetD) return OffsetS + AccessSizeS > DeltaS; - else if (OffsetS < OffsetD) + else return OffsetD + AccessSizeD > DeltaD; return true; @@ -3980,5 +3975,7 @@ void SMSchedule::print(raw_ostream &os) const { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Utility function used for debugging to print the schedule. -void SMSchedule::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); } +#endif diff --git a/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp index c3f6e92..4883779 100644 --- a/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp +++ b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp @@ -16,6 +16,10 @@ using namespace llvm; +namespace llvm { +template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTreeBase +} + char MachinePostDominatorTree::ID = 0; //declare initializeMachinePostDominatorTreePass @@ -24,8 +28,7 @@ INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree", MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) { initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry()); - DT = new DominatorTreeBase<MachineBasicBlock>(true); //true indicate - // postdominator + DT = new PostDomTreeBase<MachineBasicBlock>(); } FunctionPass * diff --git a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp index fc32183..1e74104 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp @@ -1,10 +1,21 @@ +//===- lib/Codegen/MachineRegionInfo.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineRegionInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" -#define DEBUG_TYPE "region" +#define DEBUG_TYPE "machine-region-info" using namespace llvm; @@ -12,36 +23,29 @@ STATISTIC(numMachineRegions, "The # of machine regions"); STATISTIC(numMachineSimpleRegions, "The # of simple machine regions"); namespace llvm { + template class RegionBase<RegionTraits<MachineFunction>>; template class RegionNodeBase<RegionTraits<MachineFunction>>; template class RegionInfoBase<RegionTraits<MachineFunction>>; -} + +} // end namespace llvm //===----------------------------------------------------------------------===// // MachineRegion implementation -// MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit, MachineRegionInfo* RI, MachineDominatorTree *DT, MachineRegion *Parent) : - RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) { - -} + RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {} -MachineRegion::~MachineRegion() { } +MachineRegion::~MachineRegion() = default; //===----------------------------------------------------------------------===// // MachineRegionInfo implementation -// - -MachineRegionInfo::MachineRegionInfo() : - RegionInfoBase<RegionTraits<MachineFunction>>() { -} +MachineRegionInfo::MachineRegionInfo() = default; -MachineRegionInfo::~MachineRegionInfo() { - -} +MachineRegionInfo::~MachineRegionInfo() = default; void MachineRegionInfo::updateStatistics(MachineRegion *R) { ++numMachineRegions; @@ -74,9 +78,7 @@ MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) { initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry()); } -MachineRegionInfoPass::~MachineRegionInfoPass() { - -} +MachineRegionInfoPass::~MachineRegionInfoPass() = default; bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) { releaseMemory(); @@ -86,6 +88,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) { auto DF = &getAnalysis<MachineDominanceFrontier>(); RI.recalculate(F, DT, PDT, DF); + + DEBUG(RI.dump()); + return false; } @@ -103,9 +108,10 @@ void MachineRegionInfoPass::verifyAnalysis() const { void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequiredTransitive<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<DominanceFrontierWrapperPass>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + MachineFunctionPass::getAnalysisUsage(AU); } void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const { @@ -119,22 +125,24 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const { #endif char MachineRegionInfoPass::ID = 0; +char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; -INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions", - "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, + "Detect single entry single exit regions", true, true) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) -INITIALIZE_PASS_END(MachineRegionInfoPass, 
"regions", - "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE, + "Detect single entry single exit regions", true, true) // Create methods available outside of this file, to use them // "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by // the link time optimization. namespace llvm { - FunctionPass *createMachineRegionInfoPass() { - return new MachineRegionInfoPass(); - } + +FunctionPass *createMachineRegionInfoPass() { + return new MachineRegionInfoPass(); } +} // end namespace llvm diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 242cb0b..9a92ee2 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===// +//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +12,26 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/Support/raw_os_ostream.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> using namespace llvm; @@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden, void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF) - : MF(MF), TheDelegate(nullptr), - TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && - EnableSubRegLiveness) { + : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && + EnableSubRegLiveness), + IsUpdatedCSRsInitialized(false) { unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); @@ -444,8 +458,8 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const { return TRC.getLaneMask(); } -#ifndef NDEBUG -void MachineRegisterInfo::dumpUses(unsigned Reg) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const { for (MachineInstr &I : use_instructions(Reg)) I.dump(); } @@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const { } return false; } + +void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) { + + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + assert(Reg && (Reg < TRI->getNumRegs()) && + "Trying to disable an invalid register"); + + if (!IsUpdatedCSRsInitialized) { + const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); + for (const MCPhysReg *I = CSR; *I; ++I) + UpdatedCSRs.push_back(*I); + + // Zero value represents the end 
of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); + + IsUpdatedCSRsInitialized = true; + } + + // Remove the register (and its aliases from the list). + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI), + UpdatedCSRs.end()); +} + +const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const { + if (IsUpdatedCSRsInitialized) + return UpdatedCSRs.data(); + + return getTargetRegisterInfo()->getCalleeSavedRegs(MF); +} + +void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { + if (IsUpdatedCSRsInitialized) + UpdatedCSRs.clear(); + + for (MCPhysReg Reg : CSRs) + UpdatedCSRs.push_back(Reg); + + // Zero value represents the end of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); + IsUpdatedCSRsInitialized = true; +} diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp index e06bc51..eaba9a5 100644 --- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp @@ -13,29 +13,66 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PriorityQueue.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/ScheduleDFS.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <memory> +#include <string> +#include <tuple> +#include <utility> +#include <vector> using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace llvm { + cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, cl::desc("Force top-down list scheduling")); cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, @@ -43,7 +80,8 @@ cl::opt<bool> 
ForceBottomUp("misched-bottomup", cl::Hidden, cl::opt<bool> DumpCriticalPathLength("misched-dcpl", cl::Hidden, cl::desc("Print critical path length to stdout")); -} + +} // end namespace llvm #ifndef NDEBUG static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, @@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden, cl::desc("Enable memop clustering."), cl::init(true)); -// Experimental heuristics -static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); @@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8; // Pin the vtables to this file. void MachineSchedStrategy::anchor() {} + void ScheduleDAGMutation::anchor() {} //===----------------------------------------------------------------------===// // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// -MachineSchedContext::MachineSchedContext(): - MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) { +MachineSchedContext::MachineSchedContext() { RegClassInfo = new RegisterClassInfo(); } @@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() { } namespace { + /// Base class for a machine scheduler class that can run at any point. class MachineSchedulerBase : public MachineSchedContext, public MachineFunctionPass { @@ -149,18 +184,20 @@ public: protected: ScheduleDAGInstrs *createPostMachineScheduler(); }; -} // namespace + +} // end anonymous namespace char MachineScheduler::ID = 0; char &llvm::MachineSchedulerID = MachineScheduler::ID; -INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) MachineScheduler::MachineScheduler() @@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { /// MachineSchedOpt allows command line selection of the scheduler. static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, - RegisterPassParser<MachineSchedRegistry> > + RegisterPassParser<MachineSchedRegistry>> MachineSchedOpt("misched", cl::init(&useDefaultMachineSched), cl::Hidden, cl::desc("Machine instruction scheduler to use")); @@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // instruction stream until we find the nearest boundary. unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; - for (;I != MBB->begin(); --I) { + for (; I != MBB->begin(); --I) { MachineInstr &MI = *std::prev(I); if (isSchedBoundary(&MI, &*MBB, MF, TII)) break; @@ -495,7 +532,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // thumb2 size reduction is currently an exception, so the PostMIScheduler // needs to do this. 
if (FixKillFlags) - Scheduler.fixupKills(&*MBB); + Scheduler.fixupKills(*MBB); } Scheduler.finalizeSchedule(); } @@ -504,13 +541,14 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { // unimplemented } -LLVM_DUMP_METHOD -void ReadyQueue::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ReadyQueue::dump() const { dbgs() << "Queue " << Name << ": "; - for (unsigned i = 0, e = Queue.size(); i < e; ++i) - dbgs() << Queue[i]->NodeNum << " "; + for (const SUnit *SU : Queue) + dbgs() << SU->NodeNum << " "; dbgs() << "\n"; } +#endif //===----------------------------------------------------------------------===// // ScheduleDAGMI - Basic machine instruction scheduling. This is @@ -519,8 +557,7 @@ void ReadyQueue::dump() { // ===----------------------------------------------------------------------===/ // Provide a vtable anchor. -ScheduleDAGMI::~ScheduleDAGMI() { -} +ScheduleDAGMI::~ScheduleDAGMI() = default; bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); @@ -572,10 +609,8 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { /// releaseSuccessors - Call releaseSucc on each of SU's successors. void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - releaseSucc(SU, &*I); - } + for (SDep &Succ : SU->Succs) + releaseSucc(SU, &Succ); } /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When @@ -611,10 +646,8 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { /// releasePredecessors - Call releasePred on each of SU's predecessors. void ScheduleDAGMI::releasePredecessors(SUnit *SU) { - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - releasePred(SU, &*I); - } + for (SDep &Pred : SU->Preds) + releasePred(SU, &Pred); } /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after @@ -687,8 +720,8 @@ void ScheduleDAGMI::schedule() { DEBUG( if (EntrySU.getInstr() != nullptr) EntrySU.dumpAll(this); - for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this); + for (const SUnit &SU : SUnits) + SU.dumpAll(this); if (ExitSU.getInstr() != nullptr) ExitSU.dumpAll(this); ); @@ -749,28 +782,25 @@ void ScheduleDAGMI::schedule() { /// Apply each ScheduleDAGMutation step in order. void ScheduleDAGMI::postprocessDAG() { - for (unsigned i = 0, e = Mutations.size(); i < e; ++i) { - Mutations[i]->apply(this); - } + for (auto &m : Mutations) + m->apply(this); } void ScheduleDAGMI:: findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, SmallVectorImpl<SUnit*> &BotRoots) { - for (std::vector<SUnit>::iterator - I = SUnits.begin(), E = SUnits.end(); I != E; ++I) { - SUnit *SU = &(*I); - assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits"); + for (SUnit &SU : SUnits) { + assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits"); // Order predecessors so DFSResult follows the critical path. - SU->biasCriticalPath(); + SU.biasCriticalPath(); // A SUnit is ready to top schedule if it has no predecessors. - if (!I->NumPredsLeft) - TopRoots.push_back(SU); + if (!SU.NumPredsLeft) + TopRoots.push_back(&SU); // A SUnit is ready to bottom schedule if it has no successors. 
- if (!I->NumSuccsLeft) - BotRoots.push_back(SU); + if (!SU.NumSuccsLeft) + BotRoots.push_back(&SU); } ExitSU.biasCriticalPath(); } @@ -785,10 +815,9 @@ void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots, // // Nodes with unreleased weak edges can still be roots. // Release top roots in forward order. - for (SmallVectorImpl<SUnit*>::const_iterator - I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) { - SchedImpl->releaseTopNode(*I); - } + for (SUnit *SU : TopRoots) + SchedImpl->releaseTopNode(SU); + // Release bottom roots in reverse order so the higher priority nodes appear // first. This is more natural and slightly more efficient. for (SmallVectorImpl<SUnit*>::const_reverse_iterator @@ -825,7 +854,7 @@ void ScheduleDAGMI::placeDebugValues() { RegionBegin = FirstDbgValue; } - for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator + for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI); MachineInstr *DbgValue = P.first; @@ -841,7 +870,7 @@ void ScheduleDAGMI::placeDebugValues() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void ScheduleDAGMI::dumpSchedule() const { +LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { if (SUnit *SU = getSUnit(&(*MI))) SU->dump(this); @@ -992,9 +1021,9 @@ void ScheduleDAGMILive::initRegPressure() { } } DEBUG(dbgs() << "Excess PSets: "; - for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i) + for (const PressureChange &RCPS : RegionCriticalPSets) dbgs() << TRI->getRegPressureSetName( - RegionCriticalPSets[i].getPSet()) << " "; + RCPS.getPSet()) << " "; dbgs() << "\n"); } @@ -1003,16 +1032,15 @@ updateScheduledPressure(const SUnit *SU, const std::vector<unsigned> &NewMaxPressure) { const PressureDiff &PDiff = getPressureDiff(SU); unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size(); - for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end(); - I != E; ++I) { - if (!I->isValid()) + for (const PressureChange &PC : PDiff) { + if (!PC.isValid()) break; - unsigned ID = I->getPSet(); + unsigned ID = PC.getPSet(); while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID) ++CritIdx; if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() - && NewMaxPressure[ID] <= INT16_MAX) + && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max()) RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]); } unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID); @@ -1136,6 +1164,12 @@ void ScheduleDAGMILive::schedule() { dbgs() << " Pressure Diff : "; getPressureDiff(&SU).dump(*TRI); } + dbgs() << " Single Issue : "; + if (SchedModel.mustBeginGroup(SU.getInstr()) && + SchedModel.mustEndGroup(SU.getInstr())) + dbgs() << "true;"; + else + dbgs() << "false;"; dbgs() << '\n'; } if (ExitSU.getInstr() != nullptr) @@ -1396,6 +1430,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { //===----------------------------------------------------------------------===// namespace { + /// \brief Post-process the DAG to create cluster edges between neighboring /// loads or between neighboring stores. 
class BaseMemOpClusterMutation : public ScheduleDAGMutation { @@ -1403,6 +1438,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation { SUnit *SU; unsigned BaseReg; int64_t Offset; + MemOpInfo(SUnit *su, unsigned reg, int64_t ofs) : SU(su), BaseReg(reg), Offset(ofs) {} @@ -1439,31 +1475,31 @@ public: LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri) : BaseMemOpClusterMutation(tii, tri, true) {} }; -} // anonymous + +} // end anonymous namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? make_unique<LoadClusterMutation>(TII, TRI) + return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI) : nullptr; } std::unique_ptr<ScheduleDAGMutation> createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI) + return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI) : nullptr; } -} // namespace llvm +} // end namespace llvm void BaseMemOpClusterMutation::clusterNeighboringMemOps( ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { SmallVector<MemOpInfo, 32> MemOpRecords; - for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) { - SUnit *SU = MemOps[Idx]; + for (SUnit *SU : MemOps) { unsigned BaseReg; int64_t Offset; if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI)) @@ -1491,12 +1527,11 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( // dependent on SUa can prevent load combining due to register reuse. // Predecessor edges do not need to be copied from SUb to SUa since nearby // loads should have effectively the same inputs. - for (SUnit::const_succ_iterator - SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) { - if (SI->getSUnit() == SUb) + for (const SDep &Succ : SUa->Succs) { + if (Succ.getSUnit() == SUb) continue; - DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n"); - DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial)); + DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n"); + DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); } ++ClusterLength; } else @@ -1513,17 +1548,15 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { DenseMap<unsigned, unsigned> StoreChainIDs; // Map each store chain to a set of dependent MemOps. 
SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents; - for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { - SUnit *SU = &DAG->SUnits[Idx]; - if ((IsLoad && !SU->getInstr()->mayLoad()) || - (!IsLoad && !SU->getInstr()->mayStore())) + for (SUnit &SU : DAG->SUnits) { + if ((IsLoad && !SU.getInstr()->mayLoad()) || + (!IsLoad && !SU.getInstr()->mayStore())) continue; unsigned ChainPredID = DAG->SUnits.size(); - for (SUnit::const_pred_iterator - PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { - if (PI->isCtrl()) { - ChainPredID = PI->getSUnit()->NodeNum; + for (const SDep &Pred : SU.Preds) { + if (Pred.isCtrl()) { + ChainPredID = Pred.getSUnit()->NodeNum; break; } } @@ -1534,82 +1567,12 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); if (Result.second) StoreChainDependents.resize(NumChains + 1); - StoreChainDependents[Result.first->second].push_back(SU); + StoreChainDependents[Result.first->second].push_back(&SU); } // Iterate over the store chains. - for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx) - clusterNeighboringMemOps(StoreChainDependents[Idx], DAG); -} - -//===----------------------------------------------------------------------===// -// MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - -namespace { -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. -class MacroFusion : public ScheduleDAGMutation { - const TargetInstrInfo &TII; -public: - MacroFusion(const TargetInstrInfo &TII) - : TII(TII) {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; -} // anonymous - -namespace llvm { - -std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(const TargetInstrInfo *TII) { - return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr; -} - -} // namespace llvm - -/// \brief Callback from DAG postProcessing to create cluster edges to encourage -/// fused operations. -void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!TII.shouldScheduleAdjacent(Pred, *Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. 
- for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - } - for (SDep &SuccDep : SU.Succs) { - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - } - - DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); - break; - } + for (auto &SCD : StoreChainDependents) + clusterNeighboringMemOps(SCD, DAG); } //===----------------------------------------------------------------------===// @@ -1617,6 +1580,7 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { //===----------------------------------------------------------------------===// namespace { + /// \brief Post-process the DAG to create weak edges from all uses of a copy to /// the one use that defines the copy's source vreg, most likely an induction /// variable increment. @@ -1626,6 +1590,7 @@ class CopyConstrain : public ScheduleDAGMutation { // RegionEndIdx is the slot index of the last non-debug instruction in the // scheduling region. So we may have RegionBeginIdx == RegionEndIdx. SlotIndex RegionEndIdx; + public: CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} @@ -1634,17 +1599,18 @@ public: protected: void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG); }; -} // anonymous + +} // end anonymous namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createCopyConstrainDAGMutation(const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) { - return make_unique<CopyConstrain>(TII, TRI); + const TargetRegisterInfo *TRI) { + return llvm::make_unique<CopyConstrain>(TII, TRI); } -} // namespace llvm +} // end namespace llvm /// constrainLocalCopy handles two possibilities: /// 1) Local src: @@ -1749,16 +1715,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex()); MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def); SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef); - for (SUnit::const_succ_iterator - I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end(); - I != E; ++I) { - if (I->getKind() != SDep::Data || I->getReg() != LocalReg) + for (const SDep &Succ : LastLocalSU->Succs) { + if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg) continue; - if (I->getSUnit() == GlobalSU) + if (Succ.getSUnit() == GlobalSU) continue; - if (!DAG->canAddEdge(GlobalSU, I->getSUnit())) + if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit())) return; - LocalUses.push_back(I->getSUnit()); + LocalUses.push_back(Succ.getSUnit()); } // Open the top of the GlobalLI hole by constraining any earlier global uses // to precede the start of LocalLI. 
@@ -1766,15 +1730,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { MachineInstr *FirstLocalDef = LIS->getInstructionFromIndex(LocalLI->beginIndex()); SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef); - for (SUnit::const_pred_iterator - I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) { - if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg) + for (const SDep &Pred : GlobalSU->Preds) { + if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg) continue; - if (I->getSUnit() == FirstLocalSU) + if (Pred.getSUnit() == FirstLocalSU) continue; - if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit())) + if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit())) return; - GlobalUses.push_back(I->getSUnit()); + GlobalUses.push_back(Pred.getSUnit()); } DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n"); // Add the weak edges. @@ -1805,12 +1768,11 @@ void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { RegionEndIdx = DAG->getLIS()->getInstructionIndex( *priorNonDebug(DAG->end(), DAG->begin())); - for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { - SUnit *SU = &DAG->SUnits[Idx]; - if (!SU->getInstr()->isCopy()) + for (SUnit &SU : DAG->SUnits) { + if (!SU.getInstr()->isCopy()) continue; - constrainLocalCopy(SU, static_cast<ScheduleDAGMILive*>(DAG)); + constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG)); } } @@ -1836,7 +1798,7 @@ void SchedBoundary::reset() { CheckPending = false; CurrCycle = 0; CurrMOps = 0; - MinReadyCycle = UINT_MAX; + MinReadyCycle = std::numeric_limits<unsigned>::max(); ExpectedLatency = 0; DependentLatency = 0; RetiredMOps = 0; @@ -1861,10 +1823,9 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { if (!SchedModel->hasInstrSchedModel()) return; RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); - for (std::vector<SUnit>::iterator - I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { - const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); - RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC) + for (SUnit &SU : DAG->SUnits) { + const MCSchedClassDesc *SC = DAG->getSchedClass(&SU); + RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC) * SchedModel->getMicroOpFactor(); for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC), @@ -1937,12 +1898,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) { && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) { return true; } + unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; } + + if (CurrMOps > 0 && + ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) { + DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must " + << (isTop()? 
"begin" : "end") << " group\n"); + return true; + } + if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { const MCSchedClassDesc *SC = DAG->getSchedClass(SU); for (TargetSchedModel::ProcResIter @@ -1968,12 +1939,11 @@ unsigned SchedBoundary:: findMaxLatency(ArrayRef<SUnit*> ReadySUs) { SUnit *LateSU = nullptr; unsigned RemLatency = 0; - for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end(); - I != E; ++I) { - unsigned L = getUnscheduledLatency(*I); + for (SUnit *SU : ReadySUs) { + unsigned L = getUnscheduledLatency(SU); if (L > RemLatency) { RemLatency = L; - LateSU = *I; + LateSU = SU; } } if (LateSU) { @@ -2039,7 +2009,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { /// Move the boundary of scheduled code by one cycle. void SchedBoundary::bumpCycle(unsigned NextCycle) { if (SchedModel->getMicroOpBufferSize() == 0) { - assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); + assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && + "MinReadyCycle uninitialized"); if (MinReadyCycle > NextCycle) NextCycle = MinReadyCycle; } @@ -2237,6 +2208,18 @@ void SchedBoundary::bumpNode(SUnit *SU) { // one cycle. Since we commonly reach the max MOps here, opportunistically // bump the cycle to avoid uselessly checking everything in the readyQ. CurrMOps += IncMOps; + + // Bump the cycle count for issue group constraints. + // This must be done after NextCycle has been adjust for all other stalls. + // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set + // currCycle to X. + if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) { + DEBUG(dbgs() << " Bump cycle to " + << (isTop() ? "end" : "begin") << " group\n"); + bumpCycle(++NextCycle); + } + while (CurrMOps >= SchedModel->getIssueWidth()) { DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle " << CurrCycle << '\n'); @@ -2250,7 +2233,7 @@ void SchedBoundary::bumpNode(SUnit *SU) { void SchedBoundary::releasePending() { // If the available queue is empty, it is safe to reset MinReadyCycle. if (Available.empty()) - MinReadyCycle = UINT_MAX; + MinReadyCycle = std::numeric_limits<unsigned>::max(); // Check to see if any of the pending instructions are ready to issue. If // so, add them to the available queue. @@ -2323,10 +2306,10 @@ SUnit *SchedBoundary::pickOnlyChoice() { return nullptr; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // This is useful information to dump after bumpNode. // Note that the Queue contents are more useful before pickNodeFromQueue. -void SchedBoundary::dumpScheduledState() { +LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const { unsigned ResFactor; unsigned ResCount; if (ZoneCritResIdx) { @@ -2665,12 +2648,15 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, } } -void GenericScheduler::dumpPolicy() { +void GenericScheduler::dumpPolicy() const { + // Cannot completely remove virtual function even in release mode. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dbgs() << "GenericScheduler RegionPolicy: " << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure << " OnlyTopDown=" << RegionPolicy.OnlyTopDown << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp << "\n"; +#endif } /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic @@ -2714,17 +2700,16 @@ void GenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); // Some roots may not feed into ExitSU. 
Check all of them in case. - for (std::vector<SUnit*>::const_iterator - I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { - if ((*I)->getDepth() > Rem.CriticalPath) - Rem.CriticalPath = (*I)->getDepth(); + for (const SUnit *SU : Bot.Available) { + if (SU->getDepth() > Rem.CriticalPath) + Rem.CriticalPath = SU->getDepth(); } DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n'); if (DumpCriticalPathLength) { errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n"; } - if (EnableCyclicPath) { + if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) { Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); checkAcyclicLatency(); } @@ -2964,10 +2949,10 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); ReadyQueue &Q = Zone.Available; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + for (SUnit *SU : Q) { SchedCandidate TryCand(ZonePolicy); - initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker); // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); @@ -3106,7 +3091,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { } void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { - MachineBasicBlock::iterator InsertPos = SU->getInstr(); if (!isTop) ++InsertPos; @@ -3114,18 +3098,17 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { // Find already scheduled copies with a single physreg dependence and move // them just above the scheduled instruction. - for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end(); - I != E; ++I) { - if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) + for (SDep &Dep : Deps) { + if (Dep.getKind() != SDep::Data || !TRI->isPhysicalRegister(Dep.getReg())) continue; - SUnit *DepSU = I->getSUnit(); + SUnit *DepSU = Dep.getSUnit(); if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) continue; MachineInstr *Copy = DepSU->getInstr(); if (!Copy->isCopy()) continue; DEBUG(dbgs() << " Rescheduling physreg copy "; - I->getSUnit()->dump(DAG)); + Dep.getSUnit()->dump(DAG)); DAG->moveInstruction(Copy, InsertPos); } } @@ -3154,7 +3137,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { /// Create the standard converging machine scheduler. This will be used as the /// default scheduler if the target does not set a default. ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)); + ScheduleDAGMILive *DAG = + new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)); // Register DAG post-processors. // // FIXME: extend the mutation API to allow earlier mutations to instantiate @@ -3195,15 +3179,13 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { } } - void PostGenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); // Some roots may not feed into ExitSU. Check all of them in case. 
- for (SmallVectorImpl<SUnit*>::const_iterator - I = BotRoots.begin(), E = BotRoots.end(); I != E; ++I) { - if ((*I)->getDepth() > Rem.CriticalPath) - Rem.CriticalPath = (*I)->getDepth(); + for (const SUnit *SU : BotRoots) { + if (SU->getDepth() > Rem.CriticalPath) + Rem.CriticalPath = SU->getDepth(); } DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n'); if (DumpCriticalPathLength) { @@ -3229,6 +3211,12 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) return; + // Keep clustered nodes together. + if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), + Cand.SU == DAG->getNextClusterSucc(), + TryCand, Cand, Cluster)) + return; + // Avoid critical resource consumption and balance the schedule. if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, TryCand, Cand, ResourceReduce)) @@ -3250,9 +3238,9 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { ReadyQueue &Q = Top.Available; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + for (SUnit *SU : Q) { SchedCandidate TryCand(Cand.Policy); - TryCand.SU = *I; + TryCand.SU = SU; TryCand.AtTop = true; TryCand.initResourceDelta(DAG, SchedModel); tryCandidate(Cand, TryCand); @@ -3302,7 +3290,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { - return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), + return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C), /*RemoveKillFlags=*/true); } @@ -3311,14 +3299,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { //===----------------------------------------------------------------------===// namespace { + /// \brief Order nodes by the ILP metric. struct ILPOrder { - const SchedDFSResult *DFSResult; - const BitVector *ScheduledTrees; + const SchedDFSResult *DFSResult = nullptr; + const BitVector *ScheduledTrees = nullptr; bool MaximizeILP; - ILPOrder(bool MaxILP) - : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {} + ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {} /// \brief Apply a less-than relation on node priority. /// @@ -3347,12 +3335,13 @@ struct ILPOrder { /// \brief Schedule based on the ILP metric. 
class ILPScheduler : public MachineSchedStrategy { - ScheduleDAGMILive *DAG; + ScheduleDAGMILive *DAG = nullptr; ILPOrder Cmp; std::vector<SUnit*> ReadyQ; + public: - ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {} + ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {} void initialize(ScheduleDAGMI *dag) override { assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness"); @@ -3405,14 +3394,16 @@ public: std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); } }; -} // namespace + +} // end anonymous namespace static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true)); + return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true)); } static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false)); + return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false)); } + static MachineSchedRegistry ILPMaxRegistry( "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); static MachineSchedRegistry ILPMinRegistry( @@ -3424,6 +3415,7 @@ static MachineSchedRegistry ILPMinRegistry( #ifndef NDEBUG namespace { + /// Apply a less-than relation on the node order, which corresponds to the /// instruction order prior to scheduling. IsReverse implements greater-than. template<bool IsReverse> @@ -3444,11 +3436,12 @@ class InstructionShuffler : public MachineSchedStrategy { // Using a less-than relation (SUnitOrder<false>) for the TopQ priority // gives nodes with a higher number higher priority causing the latest // instructions to be scheduled first. - PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>> TopQ; // When scheduling bottom-up, use greater-than as the queue priority. 
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>> BottomQ; + public: InstructionShuffler(bool alternate, bool topdown) : IsAlternating(alternate), IsTopDown(topdown) {} @@ -3492,15 +3485,18 @@ public: BottomQ.push(SU); } }; -} // namespace + +} // end anonymous namespace static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { bool Alternate = !ForceTopDown && !ForceBottomUp; bool TopDown = !ForceBottomUp; assert((TopDown || !ForceTopDown) && "-misched-topdown incompatible with -misched-bottomup"); - return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown)); + return new ScheduleDAGMILive( + C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown)); } + static MachineSchedRegistry ShufflerRegistry( "shuffle", "Shuffle machine instructions alternating directions", createInstructionShuffler); @@ -3518,8 +3514,7 @@ template<> struct GraphTraits< template<> struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const ScheduleDAG *G) { return G->MF.getName(); @@ -3576,7 +3571,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { return Str; } }; -} // namespace llvm + +} // end namespace llvm #endif // NDEBUG /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp index 5f87b68..79e3fea 100644 --- a/contrib/llvm/lib/CodeGen/MachineSink.cpp +++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SparseBitVector.h" @@ -33,6 +32,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -173,14 +173,14 @@ namespace { char MachineSinking::ID = 0; char &llvm::MachineSinkingID = MachineSinking::ID; -INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI, MachineBasicBlock *MBB) { diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp index ef7e525..6c5abc6 100644 --- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -1,4 +1,4 @@ -//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===// +//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===// // // The LLVM Compiler Infrastructure 
// @@ -8,20 +8,34 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <tuple> +#include <utility> using namespace llvm; @@ -30,16 +44,14 @@ using namespace llvm; char MachineTraceMetrics::ID = 0; char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) -MachineTraceMetrics::MachineTraceMetrics() - : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr), - MRI(nullptr), Loops(nullptr) { +MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); } @@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const { return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds); } - //===----------------------------------------------------------------------===// // Ensemble utility functions //===----------------------------------------------------------------------===// @@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct) } // Virtual destructor serves as an anchor. -MachineTraceMetrics::Ensemble::~Ensemble() {} +MachineTraceMetrics::Ensemble::~Ensemble() = default; const MachineLoop* MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const { @@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) { // MinInstrCountEnsemble - Pick the trace that executes the least number of // instructions. namespace { + class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble { const char *getName() const override { return "MinInstr"; } const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override; @@ -306,7 +318,8 @@ public: MinInstrCountEnsemble(MachineTraceMetrics *mtm) : MachineTraceMetrics::Ensemble(mtm) {} }; -} + +} // end anonymous namespace // Select the preferred predecessor for MBB. 
const MachineBasicBlock* @@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const { // revisit blocks. namespace { + struct LoopBounds { MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks; SmallPtrSet<const MachineBasicBlock*, 8> Visited; const MachineLoopInfo *Loops; - bool Downward; + bool Downward = false; + LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks, - const MachineLoopInfo *loops) - : Blocks(blocks), Loops(loops), Downward(false) {} + const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {} }; -} + +} // end anonymous namespace // Specialize po_iterator_storage in order to prune the post-order traversal so // it is limited to the current loop and doesn't traverse the loop back edges. namespace llvm { + template<> class po_iterator_storage<LoopBounds, true> { LoopBounds &LB; + public: po_iterator_storage(LoopBounds &lb) : LB(lb) {} + void finishPostorder(const MachineBasicBlock*) {} bool insertEdge(Optional<const MachineBasicBlock *> From, @@ -452,7 +470,8 @@ public: return LB.Visited.insert(To).second; } }; -} + +} // end namespace llvm /// Compute the trace through MBB. void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) { @@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const { // A data dependency is represented as a defining MI and operand numbers on the // defining and using MI. namespace { + struct DataDep { const MachineInstr *DefMI; unsigned DefOp; @@ -622,7 +642,8 @@ struct DataDep { assert((++DefI).atEnd() && "Register has multiple defs"); } }; -} + +} // end anonymous namespace // Get the input data dependencies that must be ready before UseMI can issue. // Return true if UseMI has any physreg operands. @@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI, // direction instructions are scanned, it could be the operand that defined the // regunit, or the highest operand to read the regunit. namespace { + struct LiveRegUnit { unsigned RegUnit; - unsigned Cycle; - const MachineInstr *MI; - unsigned Op; + unsigned Cycle = 0; + const MachineInstr *MI = nullptr; + unsigned Op = 0; unsigned getSparseSetIndex() const { return RegUnit; } - LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {} + LiveRegUnit(unsigned RU) : RegUnit(RU) {} }; -} + +} // end anonymous namespace // Identify physreg dependencies for UseMI, and update the live regunit // tracking set when scanning instructions downwards. @@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, return Height; } - typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap; // Push the height of DefMI upwards if required to match UseMI. diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp index a98139f..fcb5448 100644 --- a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp @@ -23,7 +23,6 @@ // the verifier errors. 
//===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" @@ -36,6 +35,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" @@ -87,7 +88,6 @@ namespace { RegSet regsLive; RegVector regsDefined, regsDead, regsKilled; RegMaskVector regMasks; - RegSet regsLiveInButUnused; SlotIndex lastIndex; @@ -188,8 +188,9 @@ namespace { return Reg < regsReserved.size() && regsReserved.test(Reg); } - bool isAllocatable(unsigned Reg) { - return Reg < TRI->getNumRegs() && MRI->isAllocatable(Reg); + bool isAllocatable(unsigned Reg) const { + return Reg < TRI->getNumRegs() && TRI->isInAllocatableClass(Reg) && + !regsReserved.test(Reg); } // Analysis information if available @@ -260,8 +261,8 @@ namespace { static char ID; // Pass ID, replacement for typeid const std::string Banner; - MachineVerifierPass(const std::string &banner = nullptr) - : MachineFunctionPass(ID), Banner(banner) { + MachineVerifierPass(std::string banner = std::string()) + : MachineFunctionPass(ID), Banner(std::move(banner)) { initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry()); } @@ -418,7 +419,6 @@ unsigned MachineVerifier::verify(MachineFunction &MF) { regsDead.clear(); regsKilled.clear(); regMasks.clear(); - regsLiveInButUnused.clear(); MBBInfoMap.clear(); return foundErrors; @@ -526,9 +526,11 @@ void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { void MachineVerifier::visitMachineFunctionBefore() { lastIndex = SlotIndex(); - regsReserved = MRI->getReservedRegs(); + regsReserved = MRI->reservedRegsFrozen() ? MRI->getReservedRegs() + : TRI->getReservedRegs(*MF); - markReachable(&MF->front()); + if (!MF->empty()) + markReachable(&MF->front()); // Build a set of the basic blocks in the function. FunctionBlocks.clear(); @@ -548,7 +550,8 @@ void MachineVerifier::visitMachineFunctionBefore() { // Check that the register use lists are sane. MRI->verifyUseLists(); - verifyStackFrame(); + if (!MF->empty()) + verifyStackFrame(); } // Does iterator point to a and b as the first two elements? 
@@ -572,7 +575,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && MBB->getIterator() != MBB->getParent()->begin()) { - report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB); + report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); } } } @@ -752,11 +755,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { regsLive.insert(*SubRegs); } } - regsLiveInButUnused = regsLive; const MachineFrameInfo &MFI = MF->getFrameInfo(); BitVector PR = MFI.getPristineRegs(*MF); - for (int I = PR.find_first(); I>0; I = PR.find_next(I)) { + for (unsigned I : PR.set_bits()) { for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) regsLive.insert(*SubRegs); @@ -911,6 +913,39 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { StringRef ErrorInfo; if (!TII->verifyInstruction(*MI, ErrorInfo)) report(ErrorInfo.data(), MI); + + // Verify properties of various specific instruction types + switch(MI->getOpcode()) { + default: + break; + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + // Generic loads and stores must have a single MachineMemOperand + // describing that access. + if (!MI->hasOneMemOperand()) + report("Generic instruction accessing memory must have one mem operand", + MI); + break; + case TargetOpcode::STATEPOINT: + if (!MI->getOperand(StatepointOpers::IDPos).isImm() || + !MI->getOperand(StatepointOpers::NBytesPos).isImm() || + !MI->getOperand(StatepointOpers::NCallArgsPos).isImm()) + report("meta operands to STATEPOINT not constant!", MI); + break; + + auto VerifyStackMapConstant = [&](unsigned Offset) { + if (!MI->getOperand(Offset).isImm() || + MI->getOperand(Offset).getImm() != StackMaps::ConstantOp || + !MI->getOperand(Offset + 1).isImm()) + report("stack map constant to STATEPOINT not well formed!", MI); + }; + const unsigned VarStart = StatepointOpers(MI).getVarIdx(); + VerifyStackMapConstant(VarStart + StatepointOpers::CCOffset); + VerifyStackMapConstant(VarStart + StatepointOpers::FlagsOffset); + VerifyStackMapConstant(VarStart + StatepointOpers::NumDeoptOperandsOffset); + + // TODO: verify we have properly encoded deopt arguments + }; } void @@ -950,6 +985,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Operand should be tied", MO, MONum); else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum)) report("Tied def doesn't match MCInstrDesc", MO, MONum); + else if (TargetRegisterInfo::isPhysicalRegister(MO->getReg())) { + const MachineOperand &MOTied = MI->getOperand(TiedTo); + if (!MOTied.isReg()) + report("Tied counterpart must be a register", &MOTied, TiedTo); + else if (TargetRegisterInfo::isPhysicalRegister(MOTied.getReg()) && + MO->getReg() != MOTied.getReg()) + report("Tied physical registers must match.", &MOTied, TiedTo); + } } else if (MO->isReg() && MO->isTied()) report("Explicit operand should not be tied", MO, MONum); } else { @@ -1256,8 +1299,6 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Both use and def operands can read a register. 
if (MO->readsReg()) { - regsLiveInButUnused.erase(Reg); - if (MO->isKill()) addRegWithSubRegs(regsKilled, Reg); @@ -1913,9 +1954,11 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI); const VNInfo *PVNI = LR.getVNInfoBefore(PEnd); - // All predecessors must have a live-out value if this is not a - // subregister liverange. - if (!PVNI && LaneMask.none()) { + // All predecessors must have a live-out value. However for a phi + // instruction with subregister intervals + // only one of the subregisters (not necessarily the current one) needs to + // be defined. + if (!PVNI && (LaneMask.none() || !IsPHI) ) { report("Register not marked live out of predecessor", *PI); report_context(LR, Reg, LaneMask); report_context(*VNI); @@ -2020,6 +2063,8 @@ namespace { void MachineVerifier::verifyStackFrame() { unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u) + return; SmallVector<StackStateOfBB, 8> SPState; SPState.resize(MF->getNumBlockIDs()); @@ -2047,23 +2092,14 @@ void MachineVerifier::verifyStackFrame() { // Update stack state by checking contents of MBB. for (const auto &I : *MBB) { if (I.getOpcode() == FrameSetupOpcode) { - // The first operand of a FrameOpcode should be i32. - int Size = I.getOperand(0).getImm(); - assert(Size >= 0 && - "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - if (BBState.ExitIsSetup) report("FrameSetup is after another FrameSetup", &I); - BBState.ExitValue -= Size; + BBState.ExitValue -= TII->getFrameTotalSize(I); BBState.ExitIsSetup = true; } if (I.getOpcode() == FrameDestroyOpcode) { - // The first operand of a FrameOpcode should be i32. - int Size = I.getOperand(0).getImm(); - assert(Size >= 0 && - "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - + int Size = TII->getFrameTotalSize(I); if (!BBState.ExitIsSetup) report("FrameDestroy is not after a FrameSetup", &I); int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue : diff --git a/contrib/llvm/lib/CodeGen/MacroFusion.cpp b/contrib/llvm/lib/CodeGen/MacroFusion.cpp new file mode 100644 index 0000000..633a853 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/MacroFusion.cpp @@ -0,0 +1,153 @@ +//===- MacroFusion.cpp - Macro Fusion -------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the implementation of the DAG scheduling mutation +/// to pair instructions back to back. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define DEBUG_TYPE "machine-scheduler" + +STATISTIC(NumFused, "Number of instr pairs fused"); + +using namespace llvm; + +static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU, + SUnit &SecondSU) { + // Create a single weak edge between the adjacent instrs. The only effect is + // to cause bottom-up scheduling to heavily prioritize the clustered instrs. + DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); + + // Adjust the latency between the anchor instr and its + // predecessors. + for (SDep &IDep : SecondSU.Preds) + if (IDep.getSUnit() == &FirstSU) + IDep.setLatency(0); + + // Adjust the latency between the dependent instr and its + // predecessors. + for (SDep &IDep : FirstSU.Succs) + if (IDep.getSUnit() == &SecondSU) + IDep.setLatency(0); + + DEBUG(dbgs() << DAG.MF.getName() << "(): Macro fuse "; + FirstSU.print(dbgs(), &DAG); dbgs() << " - "; + SecondSU.print(dbgs(), &DAG); dbgs() << " / "; + dbgs() << DAG.TII->getName(FirstSU.getInstr()->getOpcode()) << " - " << + DAG.TII->getName(SecondSU.getInstr()->getOpcode()) << '\n'; ); + + if (&SecondSU != &DAG.ExitSU) + // Make instructions dependent on FirstSU also dependent on SecondSU to + // prevent them from being scheduled between FirstSU and and SecondSU. + for (const SDep &SI : FirstSU.Succs) { + if (SI.getSUnit() == &SecondSU) + continue; + DEBUG(dbgs() << " Copy Succ "; + SI.getSUnit()->print(dbgs(), &DAG); dbgs() << '\n';); + DAG.addEdge(SI.getSUnit(), SDep(&SecondSU, SDep::Artificial)); + } + + ++NumFused; +} + +namespace { + +/// \brief Post-process the DAG to create cluster edges between instrs that may +/// be fused by the processor into a single operation. +class MacroFusion : public ScheduleDAGMutation { + ShouldSchedulePredTy shouldScheduleAdjacent; + bool FuseBlock; + bool scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU); + +public: + MacroFusion(ShouldSchedulePredTy shouldScheduleAdjacent, bool FuseBlock) + : shouldScheduleAdjacent(shouldScheduleAdjacent), FuseBlock(FuseBlock) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +} // end anonymous namespace + +void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + + if (FuseBlock) + // For each of the SUnits in the scheduling block, try to fuse the instr in + // it with one in its predecessors. + for (SUnit &ISU : DAG->SUnits) + scheduleAdjacentImpl(*DAG, ISU); + + if (DAG->ExitSU.getInstr()) + // Try to fuse the instr in the ExitSU with one in its predecessors. + scheduleAdjacentImpl(*DAG, DAG->ExitSU); +} + +/// \brief Implement the fusion of instr pairs in the scheduling DAG, +/// anchored at the instr in AnchorSU.. 
+bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU) { + const MachineInstr &AnchorMI = *AnchorSU.getInstr(); + const TargetInstrInfo &TII = *DAG.TII; + const TargetSubtargetInfo &ST = DAG.MF.getSubtarget(); + + // Check if the anchor instr may be fused. + if (!shouldScheduleAdjacent(TII, ST, nullptr, AnchorMI)) + return false; + + // Explorer for fusion candidates among the dependencies of the anchor instr. + for (SDep &Dep : AnchorSU.Preds) { + // Ignore dependencies that don't enforce ordering. + if (Dep.getKind() == SDep::Anti || Dep.getKind() == SDep::Output || + Dep.isWeak()) + continue; + + SUnit &DepSU = *Dep.getSUnit(); + if (DepSU.isBoundaryNode()) + continue; + + const MachineInstr *DepMI = DepSU.getInstr(); + if (!shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI)) + continue; + + fuseInstructionPair(DAG, DepSU, AnchorSU); + return true; + } + + return false; +} + +std::unique_ptr<ScheduleDAGMutation> +llvm::createMacroFusionDAGMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) { + if(EnableMacroFusion) + return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, true); + return nullptr; +} + +std::unique_ptr<ScheduleDAGMutation> +llvm::createBranchMacroFusionDAGMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) { + if(EnableMacroFusion) + return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, false); + return nullptr; +} diff --git a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp index 2a8531f..f7aeb42 100644 --- a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp +++ b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp @@ -12,18 +12,18 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "phi-opt" +#define DEBUG_TYPE "opt-phis" STATISTIC(NumPHICycles, "Number of PHI cycles replaced"); STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles"); @@ -59,7 +59,7 @@ namespace { char OptimizePHIs::ID = 0; char &llvm::OptimizePHIsID = OptimizePHIs::ID; -INITIALIZE_PASS(OptimizePHIs, "opt-phis", +INITIALIZE_PASS(OptimizePHIs, DEBUG_TYPE, "Optimize machine instruction PHIs", false, false) bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) { diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp index c67a25b..9c898fa 100644 --- a/contrib/llvm/lib/CodeGen/PHIElimination.cpp +++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp @@ -34,7 +34,7 @@ #include <algorithm> using namespace llvm; -#define DEBUG_TYPE "phielim" +#define DEBUG_TYPE "phi-node-elimination" static cl::opt<bool> DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false), @@ -112,11 +112,11 @@ STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; char& llvm::PHIEliminationID = PHIElimination::ID; -INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_BEGIN(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_END(PHIElimination, 
DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp index ad9166f..513e827 100644 --- a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp +++ b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp @@ -12,10 +12,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -75,7 +75,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) { .addImm(FirstActualI->getOpcode()); for (auto &MO : FirstActualI->operands()) - MIB.addOperand(MO); + MIB.add(MO); FirstActualI->eraseFromParent(); MF.ensureAlignment(4); diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 6d64345..b13f6b6 100644 --- a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -66,7 +66,6 @@ // C = copy A <-- same-bank copy //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -77,8 +76,10 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -119,6 +120,14 @@ static cl::opt<unsigned> RewritePHILimit( "rewrite-phi-limit", cl::Hidden, cl::init(10), cl::desc("Limit the length of PHI chains to lookup")); +// Limit the length of recurrence chain when evaluating the benefit of +// commuting operands. 
+static cl::opt<unsigned> MaxRecurrenceChain( + "recurrence-chain-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum length of recurrence chain when evaluating the benefit " + "of commuting operands")); + + STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); @@ -131,12 +140,14 @@ STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { class ValueTrackerResult; + class RecurrenceInstr; class PeepholeOptimizer : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; MachineDominatorTree *DT; // Machine dominator tree + MachineLoopInfo *MLI; public: static char ID; // Pass identification @@ -150,6 +161,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); if (Aggressive) { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -160,6 +173,9 @@ namespace { typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult> RewriteMapTy; + /// \brief Sequence of instructions that formulate recurrence cycle. + typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle; + private: bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, @@ -170,6 +186,7 @@ namespace { bool optimizeCoalescableCopy(MachineInstr *MI); bool optimizeUncoalescableCopy(MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs); + bool optimizeRecurrence(MachineInstr &PHI); bool findNextSource(unsigned Reg, unsigned SubReg, RewriteMapTy &RewriteMap); bool isMoveImmediate(MachineInstr *MI, @@ -178,6 +195,13 @@ namespace { bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); + /// \brief Finds recurrence cycles, but only ones that formulated around + /// a def operand and a use operand that are tied. If there is a use + /// operand commutable with the tied use operand, find recurrence cycle + /// along that operand as well. + bool findTargetRecurrence(unsigned Reg, + const SmallSet<unsigned, 2> &TargetReg, + RecurrenceCycle &RC); /// \brief If copy instruction \p MI is a virtual register copy, track it in /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was @@ -222,6 +246,28 @@ namespace { } }; + /// \brief Helper class to hold instructions that are inside recurrence + /// cycles. The recurrence cycle is formulated around 1) a def operand and its + /// tied use operand, or 2) a def operand and a use operand that is commutable + /// with another use operand which is tied to the def operand. In the latter + /// case, index of the tied use operand and the commutable use operand are + /// maintained with CommutePair. + class RecurrenceInstr { + public: + typedef std::pair<unsigned, unsigned> IndexPair; + + RecurrenceInstr(MachineInstr *MI) : MI(MI) {} + RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2) + : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {} + + MachineInstr *getMI() const { return MI; } + Optional<IndexPair> getCommutePair() const { return CommutePair; } + + private: + MachineInstr *MI; + Optional<IndexPair> CommutePair; + }; + /// \brief Helper class to hold a reply for ValueTracker queries. 
Contains the /// returned sources for a given search and the instructions where the sources /// were tracked from. @@ -412,6 +458,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID; INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE, "Peephole Optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE, "Peephole Optimizations", false, false) @@ -1487,6 +1534,113 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy( return false; } +/// \brief Returns true if \p MO is a virtual register operand. +static bool isVirtualRegisterOperand(MachineOperand &MO) { + if (!MO.isReg()) + return false; + return TargetRegisterInfo::isVirtualRegister(MO.getReg()); +} + +bool PeepholeOptimizer::findTargetRecurrence( + unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs, + RecurrenceCycle &RC) { + // Recurrence found if Reg is in TargetRegs. + if (TargetRegs.count(Reg)) + return true; + + // TODO: Currently, we only allow the last instruction of the recurrence + // cycle (the instruction that feeds the PHI instruction) to have more than + // one use to guarantee that commuting operands does not tie registers + // with overlapping live range. Once we have actual live range info of + // each register, this constraint can be relaxed. + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + // Give up if the recurrence chain length is longer than the limit. + if (RC.size() >= MaxRecurrenceChain) + return false; + + MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg)); + unsigned Idx = MI.findRegisterUseOperandIdx(Reg); + + // Only interested in recurrences whose instructions have only one def, which + // is a virtual register. + if (MI.getDesc().getNumDefs() != 1) + return false; + + MachineOperand &DefOp = MI.getOperand(0); + if (!isVirtualRegisterOperand(DefOp)) + return false; + + // Check if def operand of MI is tied to any use operand. We are only + // interested in the case that all the instructions in the recurrence chain + // have their def operand tied with one of the use operands. + unsigned TiedUseIdx; + if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx)) + return false; + + if (Idx == TiedUseIdx) { + RC.push_back(RecurrenceInstr(&MI)); + return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC); + } else { + // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx. + unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex; + if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) { + RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx)); + return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC); + } + } + + return false; +} + +/// \brief Phi instructions will eventually be lowered to copy instructions. If +/// phi is in a loop header, a recurrence may be formulated around the source and +/// destination of the phi. For such cases, commuting operands of the instructions +/// in the recurrence may enable coalescing of the copy instruction generated +/// from the phi. For example, if there is a recurrence of +/// +/// LoopHeader: +/// %vreg1 = phi(%vreg0, %vreg100) +/// LoopLatch: +/// %vreg0<def, tied1> = ADD %vreg2<def, tied0>, %vreg1 +/// +/// , the fact that vreg0 and vreg2 are in the same tied operands set makes +/// the coalescing of the copy instruction generated from the phi in +/// LoopHeader(i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and +/// %vreg2 have overlapping live range.
This introduces additional move +/// instruction to the final assembly. However, if we commute %vreg2 and +/// %vreg1 of ADD instruction, the redundant move instruction can be +/// avoided. +bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) { + SmallSet<unsigned, 2> TargetRegs; + for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) { + MachineOperand &MO = PHI.getOperand(Idx); + assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction"); + TargetRegs.insert(MO.getReg()); + } + + bool Changed = false; + RecurrenceCycle RC; + if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) { + // Commutes operands of instructions in RC if necessary so that the copy to + // be generated from PHI can be coalesced. + DEBUG(dbgs() << "Optimize recurrence chain from " << PHI); + for (auto &RI : RC) { + DEBUG(dbgs() << "\tInst: " << *(RI.getMI())); + auto CP = RI.getCommutePair(); + if (CP) { + Changed = true; + TII->commuteInstruction(*(RI.getMI()), false, (*CP).first, + (*CP).second); + DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI())); + } + } + } + + return Changed; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; @@ -1501,6 +1655,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr; + MLI = &getAnalysis<MachineLoopInfo>(); bool Changed = false; @@ -1529,6 +1684,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallSet<unsigned, 4> CopySrcRegs; DenseMap<unsigned, MachineInstr *> CopySrcMIs; + bool IsLoopHeader = MLI->isLoopHeader(&MBB); + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); MII != MIE; ) { MachineInstr *MI = &*MII; @@ -1540,9 +1697,16 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isDebugValue()) continue; - if (MI->isPosition() || MI->isPHI()) + if (MI->isPosition()) continue; + if (IsLoopHeader && MI->isPHI()) { + if (optimizeRecurrence(*MI)) { + Changed = true; + continue; + } + } + if (!MI->isCopy()) { for (const auto &Op : MI->operands()) { // Visit all operands: definitions can be implicit or explicit. @@ -1667,7 +1831,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { MRI->markUsesInDebugValueAsUndef(FoldedReg); FoldAsLoadDefCandidates.erase(FoldedReg); ++NumLoadFold; - + // MI is replaced with FoldMI so we can continue trying to fold Changed = true; MI = FoldMI; @@ -1675,7 +1839,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { } } } - + // If we run into an instruction we can't fold across, discard // the load candidates. Note: We might be able to fold *into* this // instruction, so this needs to be after the folding logic. diff --git a/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp index 5bc5f75..4a50d89 100644 --- a/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp +++ b/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp @@ -23,13 +23,13 @@ /// This pass traverses all the instructions in a program in top-down order. /// In contrast to the instruction scheduling passes, this pass never resets /// the hazard recognizer to ensure it can correctly handles noop hazards at -/// the begining of blocks. +/// the beginning of blocks. 
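As a rough source-level illustration of the loops that the PeepholeOptimizer::optimizeRecurrence change above targets (the function below is illustrative only, not code from the patch):

// On a two-address target the latch ADD has its definition tied to its first
// input. With `Sum = A[I] + Sum;` the loaded value plays the role of %vreg2
// in the comment above, so the loop-header COPY cannot be coalesced; once the
// operands are commuted, the recurrence value Sum becomes the tied input and
// the redundant copy disappears.
int sumArray(const int *A, int N) {
  int Sum = 0;                 // incoming value of the loop-header PHI
  for (int I = 0; I < N; ++I)
    Sum = A[I] + Sum;          // the loop-carried recurrence in the latch
  return Sum;
}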
// //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp index 6081916..f2249f9 100644 --- a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -200,7 +200,7 @@ namespace { char &llvm::PostRASchedulerID = PostRAScheduler::ID; -INITIALIZE_PASS(PostRAScheduler, "post-RA-sched", +INITIALIZE_PASS(PostRAScheduler, DEBUG_TYPE, "Post RA top-down list latency scheduler", false, false) SchedulePostRATDList::SchedulePostRATDList( @@ -253,7 +253,7 @@ void SchedulePostRATDList::exitRegion() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// dumpSchedule - dump the scheduled Sequence. -void SchedulePostRATDList::dumpSchedule() const { +LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const { for (unsigned i = 0, e = Sequence.size(); i != e; i++) { if (SUnit *SU = Sequence[i]) SU->dump(this); @@ -367,7 +367,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Scheduler.finishBlock(); // Update register kills - Scheduler.fixupKills(&MBB); + Scheduler.fixupKills(MBB); } return true; diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index d27ea2f..0118580 100644 --- a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -20,7 +20,7 @@ using namespace llvm; -#define DEBUG_TYPE "processimplicitdefs" +#define DEBUG_TYPE "processimpdefs" namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def @@ -51,9 +51,7 @@ public: char ProcessImplicitDefs::ID = 0; char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; -INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs", - "Process Implicit Definitions", false, false) -INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", +INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE, "Process Implicit Definitions", false, false) void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 5fca7fa..e9f8d43 100644 --- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "pei" +#define DEBUG_TYPE "prologepilog" typedef SmallVector<MachineBasicBlock *, 4> MBBVector; static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, @@ -54,25 +54,12 @@ static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, const MBBVector &SaveBlocks, const MBBVector &RestoreBlocks); -static void doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS); - namespace { class PEI : public MachineFunctionPass { public: static char ID; - explicit PEI(const TargetMachine *TM = nullptr) : MachineFunctionPass(ID) { + PEI() : MachineFunctionPass(ID) { initializePEIPass(*PassRegistry::getPassRegistry()); - - if (TM && (!TM->usesPhysRegsForPEI())) { - SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *, - unsigned &, unsigned &, const MBBVector &, - const MBBVector &) {}; - 
ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {}; - } else { - SpillCalleeSavedRegisters = doSpillCalleeSavedRegs; - ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs; - UsesCalleeSaves = true; - } } void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -95,7 +82,7 @@ private: const MBBVector &SaveBlocks, const MBBVector &RestoreBlocks)> SpillCalleeSavedRegisters; - std::function<void(MachineFunction &MF, RegScavenger *RS)> + std::function<void(MachineFunction &MF, RegScavenger &RS)> ScavengeFrameVirtualRegs; bool UsesCalleeSaves = false; @@ -140,21 +127,19 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1), cl::desc("Warn for stack size bigger than the given" " number")); -INITIALIZE_TM_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion", - false, false) +INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false, + false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_TM_PASS_END(PEI, "prologepilog", - "Prologue/Epilogue Insertion & Frame Finalization", - false, false) +INITIALIZE_PASS_END(PEI, DEBUG_TYPE, + "Prologue/Epilogue Insertion & Frame Finalization", false, + false) -MachineFunctionPass * -llvm::createPrologEpilogInserterPass(const TargetMachine *TM) { - return new PEI(TM); +MachineFunctionPass *llvm::createPrologEpilogInserterPass() { + return new PEI(); } -STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); STATISTIC(NumBytesStackSpace, "Number of bytes used for stack in all functions"); @@ -174,6 +159,20 @@ typedef SmallSetVector<int, 8> StackObjSet; /// frame indexes with appropriate references. /// bool PEI::runOnMachineFunction(MachineFunction &Fn) { + if (!SpillCalleeSavedRegisters) { + const TargetMachine &TM = Fn.getTarget(); + if (!TM.usesPhysRegsForPEI()) { + SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *, + unsigned &, unsigned &, const MBBVector &, + const MBBVector &) {}; + ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger &) {}; + } else { + SpillCalleeSavedRegisters = doSpillCalleeSavedRegs; + ScavengeFrameVirtualRegs = scavengeFrameVirtualRegs; + UsesCalleeSaves = true; + } + } + const Function* F = Fn.getFunction(); const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); @@ -220,7 +219,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { // post-pass, scavenge the virtual registers that frame index elimination // inserted. if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) { - ScavengeFrameVirtualRegs(Fn, RS); + ScavengeFrameVirtualRegs(Fn, *RS); // Clear any vregs created by virtual scavenging. 
Fn.getRegInfo().clearVirtRegs(); @@ -265,11 +264,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) { std::vector<MachineBasicBlock::iterator> FrameSDOps; for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) - if (I->getOpcode() == FrameSetupOpcode || - I->getOpcode() == FrameDestroyOpcode) { - assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo" - " instructions should have a single immediate argument!"); - unsigned Size = I->getOperand(0).getImm(); + if (TII.isFrameInstr(*I)) { + unsigned Size = TII.getFrameSize(*I); if (Size > MaxCallFrameSize) MaxCallFrameSize = Size; AdjustsStack = true; FrameSDOps.push_back(I); @@ -280,6 +276,9 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) { AdjustsStack = true; } + assert(!MFI.isMaxCallFrameSizeComputed() || + (MFI.getMaxCallFrameSize() == MaxCallFrameSize && + MFI.adjustsStack() == AdjustsStack)); MFI.setAdjustsStack(AdjustsStack); MFI.setMaxCallFrameSize(MaxCallFrameSize); @@ -336,7 +335,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, return; const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); + const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs(); std::vector<CalleeSavedInfo> CSI; for (unsigned i = 0; CSRegs[i]; ++i) { @@ -376,22 +375,22 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, FixedSlot->Reg != Reg) ++FixedSlot; + unsigned Size = RegInfo->getSpillSize(*RC); if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { // Nope, just spill it anywhere convenient. - unsigned Align = RC->getAlignment(); + unsigned Align = RegInfo->getSpillAlignment(*RC); unsigned StackAlign = TFI->getStackAlignment(); // We may not be able to satisfy the desired alignment specification of // the TargetRegisterClass if the stack alignment is smaller. Use the // min. Align = std::min(Align, StackAlign); - FrameIdx = MFI.CreateStackObject(RC->getSize(), Align, true); + FrameIdx = MFI.CreateStackObject(Size, Align, true); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { // Spill it to the stack where we must. - FrameIdx = - MFI.CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset); } CS.setFrameIdx(FrameIdx); @@ -448,12 +447,13 @@ static void updateLiveness(MachineFunction &MF) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { for (MachineBasicBlock *MBB : Visited) { MCPhysReg Reg = CSI[i].getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. - if (!MBB->isLiveIn(Reg)) + if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); } } @@ -764,6 +764,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { } else if (MaxCSFrameIndex >= MinCSFrameIndex) { // Be careful about underflow in comparisons agains MinCSFrameIndex. 
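The calculateCallFrameInfo hunk above drops the explicit comparisons against the call-frame setup/destroy opcodes in favour of the TargetInstrInfo helpers. A minimal sketch of that idiom in isolation (computeMaxCallFrameSize is an illustrative name, not part of the patch):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;

// Fold every call-frame setup/destroy pseudo into a single maximum, letting
// TargetInstrInfo decide which instructions are frame pseudos and how large
// the call frame they describe is.
static unsigned computeMaxCallFrameSize(const MachineFunction &MF) {
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  unsigned Max = 0;
  for (const MachineBasicBlock &MBB : MF)
    for (const MachineInstr &MI : MBB)
      if (TII.isFrameInstr(MI)) {        // CALLSEQ_START or CALLSEQ_END
        unsigned Size = TII.getFrameSize(MI);
        if (Size > Max)
          Max = Size;
      }
  return Max;
}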
for (unsigned i = MaxCSFrameIndex; i != MinCSFrameIndex - 1; --i) { + if (MFI.isDeadObjectIndex(i)) + continue; + unsigned Align = MFI.getObjectAlignment(i); // Adjust to alignment boundary Offset = alignTo(Offset, Align, Skew); @@ -1049,8 +1052,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); - unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); - unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); if (RS && FrameIndexEliminationScavenging) RS->enterBasicBlock(*BB); @@ -1059,11 +1060,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { - if (I->getOpcode() == FrameSetupOpcode || - I->getOpcode() == FrameDestroyOpcode) { - InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + if (TII.isFrameInstr(*I)) { + InsideCallSequence = TII.isFrameSetup(*I); SPAdj += TII.getSPAdjust(*I); - I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); continue; } @@ -1151,90 +1150,3 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, RS->forward(MI); } } - -/// doScavengeFrameVirtualRegs - Replace all frame index virtual registers -/// with physical registers. Use the register scavenger to find an -/// appropriate register to use. -/// -/// FIXME: Iterating over the instruction stream is unnecessary. We can simply -/// iterate over the vreg use list, which at this point only contains machine -/// operands for which eliminateFrameIndex need a new scratch reg. -static void -doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) { - // Run through the instructions and find any virtual registers. - MachineRegisterInfo &MRI = MF.getRegInfo(); - for (MachineBasicBlock &MBB : MF) { - RS->enterBasicBlock(MBB); - - int SPAdj = 0; - - // The instruction stream may change in the loop, so check MBB.end() - // directly. - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { - // We might end up here again with a NULL iterator if we scavenged a - // register for which we inserted spill code for definition by what was - // originally the first instruction in MBB. - if (I == MachineBasicBlock::iterator(nullptr)) - I = MBB.begin(); - - const MachineInstr &MI = *I; - MachineBasicBlock::iterator J = std::next(I); - MachineBasicBlock::iterator P = - I == MBB.begin() ? MachineBasicBlock::iterator(nullptr) - : std::prev(I); - - // RS should process this instruction before we might scavenge at this - // location. This is because we might be replacing a virtual register - // defined by this instruction, and if so, registers killed by this - // instruction are available, and defined registers are not. - RS->forward(I); - - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - - // When we first encounter a new virtual register, it - // must be a definition. - assert(MO.isDef() && "frame index virtual missing def!"); - // Scavenge a new scratch register - const TargetRegisterClass *RC = MRI.getRegClass(Reg); - unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj); - - ++NumScavengedRegs; - - // Replace this reference to the virtual register with the - // scratch register. 
- assert(ScratchReg && "Missing scratch register!"); - MRI.replaceRegWith(Reg, ScratchReg); - - // Because this instruction was processed by the RS before this - // register was allocated, make sure that the RS now records the - // register as being used. - RS->setRegUsed(ScratchReg); - } - - // If the scavenger needed to use one of its spill slots, the - // spill code will have been inserted in between I and J. This is a - // problem because we need the spill code before I: Move I to just - // prior to J. - if (I != std::prev(J)) { - MBB.splice(J, &MBB, I); - - // Before we move I, we need to prepare the RS to visit I again. - // Specifically, RS will assert if it sees uses of registers that - // it believes are undefined. Because we have already processed - // register kills in I, when it visits I again, it will believe that - // those registers are undefined. To avoid this situation, unprocess - // the instruction I. - assert(RS->getCurrentPosition() == I && - "The register scavenger has an unexpected position"); - I = P; - RS->unprocess(P); - } else - ++I; - } - } -} diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp index 804a4c3..b29e62b 100644 --- a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp +++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp @@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {} PseudoSourceValue::~PseudoSourceValue() {} void PseudoSourceValue::printCustom(raw_ostream &O) const { - O << PSVNames[Kind]; + if (Kind < TargetCustom) + O << PSVNames[Kind]; + else + O << "TargetCustom" << Kind; } bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const { diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp index fb49a93..7b4fbac 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp @@ -21,13 +21,12 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Timer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -134,18 +133,19 @@ void RegAllocBase::allocatePhysRegs() { if (AvailablePhysReg) Matrix->assign(*VirtReg, AvailablePhysReg); - for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end(); - I != E; ++I) { - LiveInterval *SplitVirtReg = &LIS->getInterval(*I); + for (unsigned Reg : SplitVRegs) { + assert(LIS->hasInterval(Reg)); + + LiveInterval *SplitVirtReg = &LIS->getInterval(Reg); assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned"); if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) { + assert(SplitVirtReg->empty() && "Non-empty but used interval"); DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n'); aboutToRemoveInterval(*SplitVirtReg); LIS->removeInterval(SplitVirtReg->reg); continue; } DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n"); - assert(!SplitVirtReg->empty() && "expecting non-empty interval"); assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) && "expect split value in virtual register"); enqueue(SplitVirtReg); diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp 
index a558e37..7743061 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "AllocationOrder.h" #include "LiveDebugVariables.h" #include "RegAllocBase.h" @@ -28,6 +27,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/PassAnalysisSupport.h" @@ -58,8 +58,9 @@ namespace { /// whenever a register is unavailable. This is not practical in production but /// provides a useful baseline both for measuring other allocators and comparing /// the speed of the basic algorithm against other styles of allocators. -class RABasic : public MachineFunctionPass, public RegAllocBase -{ +class RABasic : public MachineFunctionPass, + public RegAllocBase, + private LiveRangeEdit::Delegate { // context MachineFunction *MF; @@ -72,6 +73,9 @@ class RABasic : public MachineFunctionPass, public RegAllocBase // selectOrSplit(). BitVector UsableRegs; + bool LRE_CanEraseVirtReg(unsigned) override; + void LRE_WillShrinkVirtReg(unsigned) override; + public: RABasic(); @@ -121,17 +125,46 @@ char RABasic::ID = 0; } // end anonymous namespace +char &llvm::RABasicID = RABasic::ID; + +INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) +INITIALIZE_PASS_DEPENDENCY(MachineScheduler) +INITIALIZE_PASS_DEPENDENCY(LiveStacks) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false, + false) + +bool RABasic::LRE_CanEraseVirtReg(unsigned VirtReg) { + if (VRM->hasPhys(VirtReg)) { + LiveInterval &LI = LIS->getInterval(VirtReg); + Matrix->unassign(LI); + aboutToRemoveInterval(LI); + return true; + } + // Unassigned virtreg is probably in the priority queue. + // RegAllocBase will erase it after dequeueing. + return false; +} + +void RABasic::LRE_WillShrinkVirtReg(unsigned VirtReg) { + if (!VRM->hasPhys(VirtReg)) + return; + + // Register is assigned, put it back on the queue for reassignment. 
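The RegAllocBasic changes below pass `this` instead of `nullptr` when constructing a LiveRangeEdit, because RABasic now implements LiveRangeEdit::Delegate. Roughly, the delegate hook-up has this shape (QueueingAllocator is a made-up name and the bodies are placeholders, not the patch's logic):

#include "llvm/CodeGen/LiveRangeEdit.h"
using namespace llvm;

// Any object passed as the Delegate argument of LiveRangeEdit receives
// callbacks while the spiller rewrites live ranges, which is how the
// allocator learns that a queued virtual register went away or shrank.
class QueueingAllocator : private LiveRangeEdit::Delegate {
  bool LRE_CanEraseVirtReg(unsigned) override {
    // A real allocator first unassigns the register from its physreg and
    // only then allows erasure; returning true here is just a placeholder.
    return true;
  }
  void LRE_WillShrinkVirtReg(unsigned) override {
    // Unassign and requeue the register so its smaller interval is
    // reconsidered for allocation.
  }
};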
+ LiveInterval &LI = LIS->getInterval(VirtReg); + Matrix->unassign(LI); + enqueue(&LI); +} + RABasic::RABasic(): MachineFunctionPass(ID) { - initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry()); - initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); - initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); - initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry()); - initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); - initializeLiveStacksPass(*PassRegistry::getPassRegistry()); - initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); - initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); - initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); - initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry()); } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -176,8 +209,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); Q.collectInterferingVRegs(); - if (Q.seenUnspillableVReg()) - return false; for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; if (!Intf->isSpillable() || Intf->weight > VirtReg.weight) @@ -202,7 +233,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, Matrix->unassign(Spill); // Spill the extracted interval. - LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats); + LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats); spiller().spill(LRE); } return true; @@ -261,7 +292,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, DEBUG(dbgs() << "spilling: " << VirtReg << '\n'); if (!VirtReg.isSpillable()) return ~0u; - LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats); + LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats); spiller().spill(LRE); // The live virtual register requesting allocation was spilled, so tell diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp index fd759bc..d5538be 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp @@ -203,6 +203,8 @@ namespace { char RAFast::ID = 0; } +INITIALIZE_PASS(RAFast, "regallocfast", "Fast Register Allocator", false, false) + /// getStackSpaceFor - This allocates space for the specified virtual register /// to be held on the stack. int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { @@ -212,8 +214,9 @@ int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { return SS; // Already has space allocated? // Allocate a new stack object for this spill location... - int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); + int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(Size, Align); // Assign the slot. 
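Several hunks in this diff (the RegAllocFast spill-slot code above, the callee-saved spill slots earlier) switch from TargetRegisterClass::getSize()/getAlignment() to the TargetRegisterInfo spill queries. A minimal sketch of the new idiom, with createSpillSlot as an illustrative helper name:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;

// Spill size and alignment are now queried through TargetRegisterInfo, so a
// target can report a spill slot that differs from the in-register size.
static int createSpillSlot(MachineFunction &MF, const TargetRegisterClass &RC) {
  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
  unsigned Size = TRI.getSpillSize(RC);
  unsigned Align = TRI.getSpillAlignment(RC);
  return MF.getFrameInfo().CreateSpillStackObject(Size, Align);
}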
StackSlotForVirtReg[VirtReg] = FrameIdx; @@ -243,8 +246,15 @@ void RAFast::addKillFlag(const LiveReg &LR) { if (MO.isUse() && !LR.LastUse->isRegTiedToDefOperand(LR.LastOpNum)) { if (MO.getReg() == LR.PhysReg) MO.setIsKill(); - else - LR.LastUse->addRegisterKilled(LR.PhysReg, TRI, true); + // else, don't do anything we are problably redefining a + // subreg of this register and given we don't track which + // lanes are actually dead, we cannot insert a kill flag here. + // Otherwise we may end up in a situation like this: + // ... = (MO) physreg:sub1, physreg <implicit-use, kill> + // ... <== Here we would allow later pass to reuse physreg:sub1 + // which is potentially wrong. + // LR:sub0 = ... + // ... = LR.sub1 <== This is going to use physreg:sub1 } } @@ -304,19 +314,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveDbgValueMap[LRI->VirtReg]; for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) { MachineInstr *DBG = LRIDbgValues[li]; - const MDNode *Var = DBG->getDebugVariable(); - const MDNode *Expr = DBG->getDebugExpression(); - bool IsIndirect = DBG->isIndirectDebugValue(); - uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0; - DebugLoc DL = DBG->getDebugLoc(); - assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) && - "Expected inlined-at fields to agree"); - MachineInstr *NewDV = - BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(FI) - .addImm(Offset) - .addMetadata(Var) - .addMetadata(Expr); + MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI); assert(NewDV->getParent() == MBB && "dangling parent pointer"); (void)NewDV; DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV); diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp index c47cfb1..020e81e 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1,4 +1,4 @@ -//===-- RegAllocGreedy.cpp - greedy register allocator --------------------===// +//===- RegAllocGreedy.cpp - greedy register allocator ---------------------===// // // The LLVM Compiler Infrastructure // @@ -19,34 +19,63 @@ #include "SpillPlacement.h" #include "Spiller.h" #include "SplitKit.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include 
"llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/PassAnalysisSupport.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> #include <queue> +#include <tuple> +#include <utility> using namespace llvm; @@ -104,13 +133,14 @@ static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator", createGreedyRegisterAllocator); namespace { + class RAGreedy : public MachineFunctionPass, public RegAllocBase, private LiveRangeEdit::Delegate { // Convenient shortcuts. - typedef std::priority_queue<std::pair<unsigned, unsigned> > PQueue; - typedef SmallPtrSet<LiveInterval *, 4> SmallLISet; - typedef SmallSet<unsigned, 16> SmallVirtRegSet; + using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>; + using SmallLISet = SmallPtrSet<LiveInterval *, 4>; + using SmallVirtRegSet = SmallSet<unsigned, 16>; // context MachineFunction *MF; @@ -125,6 +155,7 @@ class RAGreedy : public MachineFunctionPass, MachineBlockFrequencyInfo *MBFI; MachineDominatorTree *DomTree; MachineLoopInfo *Loops; + MachineOptimizationRemarkEmitter *ORE; EdgeBundles *Bundles; SpillPlacement *SpillPlacer; LiveDebugVariables *DebugVars; @@ -198,12 +229,12 @@ class RAGreedy : public MachineFunctionPass, // RegInfo - Keep additional information about each live range. struct RegInfo { - LiveRangeStage Stage; + LiveRangeStage Stage = RS_New; // Cascade - Eviction loop prevention. See canEvictInterference(). - unsigned Cascade; + unsigned Cascade = 0; - RegInfo() : Stage(RS_New), Cascade(0) {} + RegInfo() = default; }; IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo; @@ -229,10 +260,10 @@ class RAGreedy : public MachineFunctionPass, /// Cost of evicting interference. struct EvictionCost { - unsigned BrokenHints; ///< Total number of broken hints. - float MaxWeight; ///< Maximum spill weight evicted. + unsigned BrokenHints = 0; ///< Total number of broken hints. + float MaxWeight = 0; ///< Maximum spill weight evicted. - EvictionCost(): BrokenHints(0), MaxWeight(0) {} + EvictionCost() = default; bool isMax() const { return BrokenHints == ~0u; } @@ -282,8 +313,7 @@ class RAGreedy : public MachineFunctionPass, // Set B[i] = C for every live bundle where B[i] was NoCand. unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) { unsigned Count = 0; - for (int i = LiveBundles.find_first(); i >= 0; - i = LiveBundles.find_next(i)) + for (unsigned i : LiveBundles.set_bits()) if (B[i] == NoCand) { B[i] = C; Count++; @@ -411,15 +441,32 @@ private: /// Its currently assigned register. /// In case of a physical register Reg == PhysReg. 
unsigned PhysReg; + HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg) : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {} }; - typedef SmallVector<HintInfo, 4> HintsInfo; + using HintsInfo = SmallVector<HintInfo, 4>; + BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned); void collectHintInfo(unsigned, HintsInfo &); bool isUnusedCalleeSavedReg(unsigned PhysReg) const; + + /// Compute and report the number of spills and reloads for a loop. + void reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, + unsigned &FoldedReloads, unsigned &Spills, + unsigned &FoldedSpills); + + /// Report the number of spills and reloads for each loop. + void reportNumberOfSplillsReloads() { + for (MachineLoop *L : *Loops) { + unsigned Reloads, FoldedReloads, Spills, FoldedSpills; + reportNumberOfSplillsReloads(L, Reloads, FoldedReloads, Spills, + FoldedSpills); + } + } }; + } // end anonymous namespace char RAGreedy::ID = 0; @@ -439,6 +486,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(SpillPlacement) +INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) INITIALIZE_PASS_END(RAGreedy, "greedy", "Greedy Register Allocator", false, false) @@ -458,7 +506,6 @@ const char *const RAGreedy::StageName[] = { // This helps stabilize decisions based on float comparisons. const float Hysteresis = (2007 / 2048.0f); // 0.97998046875 - FunctionPass* llvm::createGreedyRegisterAllocator() { return new RAGreedy(); } @@ -490,10 +537,10 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<LiveRegMatrix>(); AU.addRequired<EdgeBundles>(); AU.addRequired<SpillPlacement>(); + AU.addRequired<MachineOptimizationRemarkEmitterPass>(); MachineFunctionPass::getAnalysisUsage(AU); } - //===----------------------------------------------------------------------===// // LiveRangeEdit delegate methods //===----------------------------------------------------------------------===// @@ -616,7 +663,6 @@ LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) { return LI; } - //===----------------------------------------------------------------------===// // Direct Assignment //===----------------------------------------------------------------------===// @@ -664,7 +710,6 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, return CheapReg ? CheapReg : PhysReg; } - //===----------------------------------------------------------------------===// // Interference eviction //===----------------------------------------------------------------------===// @@ -679,7 +724,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) { MCRegUnitIterator Units(PhysReg, TRI); for (; Units.isValid(); ++Units) { // Instantiate a "subquery", not to be confused with the Queries array. 
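The RegInfo and EvictionCost hunks above are a mechanical C++11 cleanup; a tiny standalone equivalent of the pattern (Example is a made-up type):

// In-class initializers plus a defaulted constructor replace the
// hand-written "BrokenHints(0), MaxWeight(0)" constructor.
struct Example {
  unsigned BrokenHints = 0; ///< Total number of broken hints.
  float MaxWeight = 0;      ///< Maximum spill weight evicted.

  Example() = default;      // the initializers above are applied here
};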
- LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]); + LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]); if (subQ.checkInterference()) break; } @@ -830,7 +875,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg, SmallVector<LiveInterval*, 8> Intfs; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); - assert(Q.seenAllInterferences() && "Didn't check all interfererences."); + // We usually have the interfering VRegs cached so collectInterferingVRegs() + // should be fast, we may need to recalculate if when different physregs + // overlap the same register unit so we had different SubRanges queried + // against it. + Q.collectInterferingVRegs(); ArrayRef<LiveInterval*> IVR = Q.interferingVRegs(); Intfs.append(IVR.begin(), IVR.end()); } @@ -932,7 +981,6 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, return BestPhys; } - //===----------------------------------------------------------------------===// // Region Splitting //===----------------------------------------------------------------------===// @@ -1003,7 +1051,6 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf, return SpillPlacer->scanActiveBundles(); } - /// addThroughConstraints - Add constraints and links to SpillPlacer from the /// live-through blocks in Blocks. void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf, @@ -1061,7 +1108,7 @@ void RAGreedy::growRegion(GlobalSplitCandidate &Cand) { unsigned Visited = 0; #endif - for (;;) { + while (true) { ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive(); // Find new through blocks in the periphery of PrefRegBundles. for (int i = 0, e = NewBundles.size(); i != e; ++i) { @@ -1139,9 +1186,8 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) { } DEBUG({ - for (int i = Cand.LiveBundles.find_first(); i>=0; - i = Cand.LiveBundles.find_next(i)) - dbgs() << " EB#" << i; + for (int i : Cand.LiveBundles.set_bits()) + dbgs() << " EB#" << i; dbgs() << ".\n"; }); return true; @@ -1176,8 +1222,8 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) { for (unsigned i = 0; i != UseBlocks.size(); ++i) { const SplitAnalysis::BlockInfo &BI = UseBlocks[i]; SpillPlacement::BlockConstraint &BC = SplitConstraints[i]; - bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, 0)]; - bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, 1)]; + bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, false)]; + bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, true)]; unsigned Ins = 0; if (BI.LiveIn) @@ -1190,8 +1236,8 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) { for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) { unsigned Number = Cand.ActiveBlocks[i]; - bool RegIn = LiveBundles[Bundles->getBundle(Number, 0)]; - bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)]; + bool RegIn = LiveBundles[Bundles->getBundle(Number, false)]; + bool RegOut = LiveBundles[Bundles->getBundle(Number, true)]; if (!RegIn && !RegOut) continue; if (RegIn && RegOut) { @@ -1243,7 +1289,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, unsigned IntvIn = 0, IntvOut = 0; SlotIndex IntfIn, IntfOut; if (BI.LiveIn) { - unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)]; + unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)]; if (CandIn != NoCand) { GlobalSplitCandidate &Cand = GlobalCand[CandIn]; IntvIn = 
Cand.IntvIdx; @@ -1252,7 +1298,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, } } if (BI.LiveOut) { - unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)]; + unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)]; if (CandOut != NoCand) { GlobalSplitCandidate &Cand = GlobalCand[CandOut]; IntvOut = Cand.IntvIdx; @@ -1292,7 +1338,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, unsigned IntvIn = 0, IntvOut = 0; SlotIndex IntfIn, IntfOut; - unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)]; + unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)]; if (CandIn != NoCand) { GlobalSplitCandidate &Cand = GlobalCand[CandIn]; IntvIn = Cand.IntvIdx; @@ -1300,7 +1346,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, IntfIn = Cand.Intf.first(); } - unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)]; + unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)]; if (CandOut != NoCand) { GlobalSplitCandidate &Cand = GlobalCand[CandOut]; IntvOut = Cand.IntvIdx; @@ -1459,8 +1505,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, DEBUG({ dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost) << " with bundles"; - for (int i = Cand.LiveBundles.find_first(); i>=0; - i = Cand.LiveBundles.find_next(i)) + for (int i : Cand.LiveBundles.set_bits()) dbgs() << " EB#" << i; dbgs() << ".\n"; }); @@ -1513,7 +1558,6 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, return 0; } - //===----------------------------------------------------------------------===// // Per-Block Splitting //===----------------------------------------------------------------------===// @@ -1560,7 +1604,6 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; } - //===----------------------------------------------------------------------===// // Per-Instruction Splitting //===----------------------------------------------------------------------===// @@ -1644,12 +1687,10 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; } - //===----------------------------------------------------------------------===// // Local Splitting //===----------------------------------------------------------------------===// - /// calcGapWeights - Compute the maximum spill weight that needs to be evicted /// in order to use PhysReg between two entries in SA->UseSlots. /// @@ -1720,7 +1761,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, break; for (; Gap != NumGaps; ++Gap) { - GapWeight[Gap] = llvm::huge_valf; + GapWeight[Gap] = huge_valf; if (Uses[Gap+1].getBaseIndex() >= I->end) break; } @@ -1826,7 +1867,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // Remove any gaps with regmask clobbers. if (Matrix->checkRegMaskInterference(VirtReg, PhysReg)) for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i) - GapWeight[RegMaskGaps[i]] = llvm::huge_valf; + GapWeight[RegMaskGaps[i]] = huge_valf; // Try to find the best sequence of gaps to close. // The new spill weight must be larger than any gap interference. @@ -1838,7 +1879,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // It is the spill weight that needs to be evicted. float MaxGap = GapWeight[0]; - for (;;) { + while (true) { // Live before/after split? 
const bool LiveBefore = SplitBefore != 0 || BI.LiveIn; const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut; @@ -1861,7 +1902,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // Legally, without causing looping? bool Legal = !ProgressRequired || NewGaps < NumGaps; - if (Legal && MaxGap < llvm::huge_valf) { + if (Legal && MaxGap < huge_valf) { // Estimate the new spill weight. Each instruction reads or writes the // register. Conservatively assume there are no read-modify-write // instructions. @@ -2417,7 +2458,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { do { Reg = RecoloringCandidates.pop_back_val(); - // We cannot recolor physcal register. + // We cannot recolor physical register. if (TargetRegisterInfo::isPhysicalRegister(Reg)) continue; @@ -2581,7 +2622,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, } // If we couldn't allocate a register from spilling, there is probably some - // invalid inline assembly. The base class wil report it. + // invalid inline assembly. The base class will report it. if (Stage >= RS_Done || !VirtReg.isSpillable()) return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters, Depth); @@ -2611,6 +2652,70 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, return 0; } +void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, + unsigned &FoldedReloads, + unsigned &Spills, + unsigned &FoldedSpills) { + Reloads = 0; + FoldedReloads = 0; + Spills = 0; + FoldedSpills = 0; + + // Sum up the spill and reloads in subloops. + for (MachineLoop *SubLoop : *L) { + unsigned SubReloads; + unsigned SubFoldedReloads; + unsigned SubSpills; + unsigned SubFoldedSpills; + + reportNumberOfSplillsReloads(SubLoop, SubReloads, SubFoldedReloads, + SubSpills, SubFoldedSpills); + Reloads += SubReloads; + FoldedReloads += SubFoldedReloads; + Spills += SubSpills; + FoldedSpills += SubFoldedSpills; + } + + const MachineFrameInfo &MFI = MF->getFrameInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + int FI; + + for (MachineBasicBlock *MBB : L->getBlocks()) + // Handle blocks that were not included in subloops. 
+ if (Loops->getLoopFor(MBB) == L) + for (MachineInstr &MI : *MBB) { + const MachineMemOperand *MMO; + + if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) + ++Reloads; + else if (TII->hasLoadFromStackSlot(MI, MMO, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++FoldedReloads; + else if (TII->isStoreToStackSlot(MI, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++Spills; + else if (TII->hasStoreToStackSlot(MI, MMO, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++FoldedSpills; + } + + if (Reloads || FoldedReloads || Spills || FoldedSpills) { + using namespace ore; + + MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReload", + L->getStartLoc(), L->getHeader()); + if (Spills) + R << NV("NumSpills", Spills) << " spills "; + if (FoldedSpills) + R << NV("NumFoldedSpills", FoldedSpills) << " folded spills "; + if (Reloads) + R << NV("NumReloads", Reloads) << " reloads "; + if (FoldedReloads) + R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads "; + ORE->emit(R << "generated in loop"); + } +} + bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n" << "********** Function: " << mf.getName() << '\n'); @@ -2633,6 +2738,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Indexes = &getAnalysis<SlotIndexes>(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); DomTree = &getAnalysis<MachineDominatorTree>(); + ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE(); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); Loops = &getAnalysis<MachineLoopInfo>(); Bundles = &getAnalysis<EdgeBundles>(); @@ -2658,6 +2764,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { allocatePhysRegs(); tryHintsRecoloring(); postOptimization(); + reportNumberOfSplillsReloads(); releaseMemory(); return true; diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp index 101b30b..9778103 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -1,4 +1,4 @@ -//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===// +//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===// // // The LLVM Compiler Infrastructure // @@ -32,30 +32,59 @@ #include "llvm/CodeGen/RegAllocPBQP.h" #include "RegisterCoalescer.h" #include "Spiller.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PBQP/Graph.h" +#include "llvm/CodeGen/PBQP/Math.h" +#include "llvm/CodeGen/PBQP/Solution.h" +#include "llvm/CodeGen/PBQPRAConstraint.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Function.h" 
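The reportNumberOfSplillsReloads hunk above reports per-loop spill statistics through the new MachineOptimizationRemarkEmitter dependency. A reduced sketch of the emission pattern (emitSpillRemark and its arguments are illustrative); with llc, such remarks can be surfaced via -pass-remarks-missed=regalloc:

#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
using namespace llvm;

// Build a missed-optimization remark anchored at the loop's start location
// and header block, attach named values, and hand it to the emitter.
static void emitSpillRemark(MachineOptimizationRemarkEmitter &ORE,
                            const MachineLoop &L, unsigned Spills,
                            unsigned Reloads) {
  using namespace ore;
  MachineOptimizationRemarkMissed R("regalloc", "LoopSpillReload",
                                    L.getStartLoc(), L.getHeader());
  R << NV("NumSpills", Spills) << " spills ";
  R << NV("NumReloads", Reloads) << " reloads ";
  ORE.emit(R << "generated in loop");
}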
#include "llvm/IR/Module.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstddef> #include <limits> +#include <map> #include <memory> #include <queue> #include <set> #include <sstream> +#include <string> +#include <system_error> +#include <tuple> +#include <utility> #include <vector> using namespace llvm; @@ -86,7 +115,6 @@ namespace { /// Programming problems. class RegAllocPBQP : public MachineFunctionPass { public: - static char ID; /// Construct a PBQP register allocator. @@ -113,14 +141,13 @@ public: } private: - - typedef std::map<const LiveInterval*, unsigned> LI2NodeMap; - typedef std::vector<const LiveInterval*> Node2LIMap; - typedef std::vector<unsigned> AllowedSet; - typedef std::vector<AllowedSet> AllowedSetMap; - typedef std::pair<unsigned, unsigned> RegPair; - typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap; - typedef std::set<unsigned> RegSet; + using LI2NodeMap = std::map<const LiveInterval *, unsigned>; + using Node2LIMap = std::vector<const LiveInterval *>; + using AllowedSet = std::vector<unsigned>; + using AllowedSetMap = std::vector<AllowedSet>; + using RegPair = std::pair<unsigned, unsigned>; + using CoalesceMap = std::map<RegPair, PBQP::PBQPNum>; + using RegSet = std::set<unsigned>; char *customPassID; @@ -187,13 +214,12 @@ public: /// @brief Add interference edges between overlapping vregs. class Interference : public PBQPRAConstraint { private: - - typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr; - typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey; - typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache; - typedef DenseSet<IKey> DisjointAllowedRegsCache; - typedef std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId> IEdgeKey; - typedef DenseSet<IEdgeKey> IEdgeCache; + using AllowedRegVecPtr = const PBQP::RegAlloc::AllowedRegVector *; + using IKey = std::pair<AllowedRegVecPtr, AllowedRegVecPtr>; + using IMatrixCache = DenseMap<IKey, PBQPRAGraph::MatrixPtr>; + using DisjointAllowedRegsCache = DenseSet<IKey>; + using IEdgeKey = std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId>; + using IEdgeCache = DenseSet<IEdgeKey>; bool haveDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId, @@ -228,8 +254,8 @@ private: // for the fast interference graph construction algorithm. The last is there // to save us from looking up node ids via the VRegToNode map in the graph // metadata. - typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId> - IntervalInfo; + using IntervalInfo = + std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>; static SlotIndex getStartPoint(const IntervalInfo &I) { return std::get<0>(I)->segments[std::get<1>(I)].start; @@ -276,7 +302,6 @@ private: } public: - void apply(PBQPRAGraph &G) override { // The following is loosely based on the linear scan algorithm introduced in // "Linear Scan Register Allocation" by Poletto and Sarkar. 
This version @@ -297,9 +322,10 @@ public: // Cache known disjoint allowed registers pairs DisjointAllowedRegsCache D; - typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet; - typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>, - decltype(&lowestStartPoint)> IntervalQueue; + using IntervalSet = std::set<IntervalInfo, decltype(&lowestEndPoint)>; + using IntervalQueue = + std::priority_queue<IntervalInfo, std::vector<IntervalInfo>, + decltype(&lowestStartPoint)>; IntervalSet Active(lowestEndPoint); IntervalQueue Inactive(lowestStartPoint); @@ -363,7 +389,6 @@ public: } private: - // Create an Interference edge and add it to the graph, unless it is // a null matrix, meaning the nodes' allowed registers do not have any // interference. This case occurs frequently between integer and floating @@ -372,7 +397,6 @@ private: bool createInterferenceEdge(PBQPRAGraph &G, PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId, IMatrixCache &C) { - const TargetRegisterInfo &TRI = *G.getMetadata().MF.getSubtarget().getRegisterInfo(); const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs(); @@ -409,7 +433,6 @@ private: } }; - class Coalescing : public PBQPRAConstraint { public: void apply(PBQPRAGraph &G) override { @@ -421,7 +444,6 @@ public: // gives the Ok. for (const auto &MBB : MF) { for (const auto &MI : MBB) { - // Skip not-coalescable or already coalesced copies. if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg()) continue; @@ -479,7 +501,6 @@ public: } private: - void addVirtRegCoalesce( PBQPRAGraph::RawMatrix &CostMat, const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1, @@ -496,14 +517,15 @@ private: } } } - }; -} // End anonymous namespace. +} // end anonymous namespace // Out-of-line destructor/anchor for PBQPRAConstraint. -PBQPRAConstraint::~PBQPRAConstraint() {} +PBQPRAConstraint::~PBQPRAConstraint() = default; + void PBQPRAConstraint::anchor() {} + void PBQPRAConstraintList::anchor() {} void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { @@ -554,7 +576,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF, static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI, const MachineFunction &MF) { - const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); + const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs(); for (unsigned i = 0; CSR[i] != 0; ++i) if (TRI.regsOverlap(reg, CSR[i])) return true; @@ -639,7 +661,6 @@ void RegAllocPBQP::spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals, MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM, Spiller &VRegSpiller) { - VRegsToAlloc.erase(VReg); LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM, nullptr, &DeadRemats); @@ -717,7 +738,15 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF, if (PReg == 0) { const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg); - PReg = RC.getRawAllocationOrder(MF).front(); + const ArrayRef<MCPhysReg> RawPRegOrder = RC.getRawAllocationOrder(MF); + for (unsigned CandidateReg : RawPRegOrder) { + if (!VRM.getRegInfo().isReserved(CandidateReg)) { + PReg = CandidateReg; + break; + } + } + assert(PReg && + "No un-reserved physical registers in this register class"); } VRM.assignVirt2Phys(LI.reg, PReg); @@ -777,7 +806,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { // If there are non-empty intervals allocate them using pbqp. 
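The Interference constraint's comment above refers to the linear scan sweep of Poletto and Sarkar; a small self-contained sketch of that idea, using made-up Interval and addEdge stand-ins rather than the PBQP graph types:

#include <algorithm>
#include <set>
#include <utility>
#include <vector>

// Process intervals in order of increasing start point and keep an "active"
// set ordered by end point. Every interval still active when a new one
// starts overlaps it, so interference edges only need to be added against
// that set rather than against all pairs.
struct Interval { unsigned Id; unsigned Start, End; };

static void buildInterferenceEdges(std::vector<Interval> Ints) {
  std::sort(Ints.begin(), Ints.end(),
            [](const Interval &A, const Interval &B) { return A.Start < B.Start; });
  auto ByEnd = [](const Interval *A, const Interval *B) {
    return std::make_pair(A->End, A->Id) < std::make_pair(B->End, B->Id);
  };
  std::set<const Interval *, decltype(ByEnd)> Active(ByEnd);
  for (const Interval &I : Ints) {
    // Retire everything that ends before the new interval starts.
    while (!Active.empty() && (*Active.begin())->End <= I.Start)
      Active.erase(Active.begin());
    for (const Interval *A : Active)
      (void)A; // addEdge(A->Id, I.Id) would go here in a real builder
    Active.insert(&I);
  }
}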
if (!VRegsToAlloc.empty()) { - const TargetSubtargetInfo &Subtarget = MF.getSubtarget(); std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot = llvm::make_unique<PBQPRAConstraintList>(); @@ -840,7 +868,8 @@ static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, }); } -void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { const Vector &Costs = getNodeCosts(NId); assert(Costs.getLength() != 0 && "Empty vector in graph."); @@ -861,7 +890,10 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { } } -LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); } +LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { + dump(dbgs()); +} +#endif void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const { OS << "graph {\n"; @@ -892,5 +924,3 @@ FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) { FunctionPass* llvm::createDefaultPBQPRegisterAllocator() { return createPBQPRegisterAllocator(); } - -#undef DEBUG_TYPE diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp index ece44c2..855aa37 100644 --- a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp @@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Clobbered Registers: "); - for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) - if (MRI->isPhysRegModified(PReg, true)) - RegMask[PReg / 32] &= ~(1u << PReg % 32); + const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask(); + auto SetRegAsDefined = [&RegMask] (unsigned Reg) { + RegMask[Reg / 32] &= ~(1u << Reg % 32); + }; + // Scan all the physical registers. When a register is defined in the current + // function set it and all the aliasing registers as defined in the regmask. + for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) { + // If a register is in the UsedPhysRegsMask set then mark it as defined. + // All it's aliases will also be in the set, so we can skip setting + // as defined all the aliases here. + if (UsedPhysRegsMask.test(PReg)) { + SetRegAsDefined(PReg); + continue; + } + // If a register is defined by an instruction mark it as defined together + // with all it's aliases. 
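The RegUsageInfoCollector hunk above edits the call-preserved register mask in place; a self-contained illustration of the bit layout it relies on (one bit per physical register, 32 per word, a set bit meaning "preserved"), using a plain vector instead of the real mask:

#include <cstdint>
#include <vector>

// Clearing a register's bit marks it as clobbered/defined, which is what the
// SetRegAsDefined lambda in the hunk above does for a register and its aliases.
static void markClobbered(std::vector<uint32_t> &RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32));
}

static bool isPreserved(const std::vector<uint32_t> &RegMask, unsigned Reg) {
  return (RegMask[Reg / 32] & (1u << (Reg % 32))) != 0;
}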
+ if (!MRI->def_empty(PReg)) { + for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI) + SetRegAsDefined(*AI); + } + } if (!TargetFrameLowering::isSafeForNoCSROpt(F)) { const uint32_t *CallPreservedMask = diff --git a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp index 178fa18..956dec3 100644 --- a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -1,4 +1,4 @@ -//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===// +//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===// // // The LLVM Compiler Infrastructure // @@ -15,11 +15,21 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; @@ -29,8 +39,7 @@ static cl::opt<unsigned> StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"), cl::desc("Limit all regclasses to N registers")); -RegisterClassInfo::RegisterClassInfo() - : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {} +RegisterClassInfo::RegisterClassInfo() = default; void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { bool Update = false; @@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { // Does this MF have different CSRs? assert(TRI && "no register info set"); - const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); - if (Update || CSR != CalleeSaved) { - // Build a CSRNum map. Every CSR alias gets an entry pointing to the last + + // Get the callee saved registers. + const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs(); + if (Update || CSR != CalleeSavedRegs) { + // Build a CSRAlias map. Every CSR alias saves the last // overlapping CSR. - CSRNum.clear(); - CSRNum.resize(TRI->getNumRegs(), 0); - for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... + CalleeSavedAliases.resize(TRI->getNumRegs(), 0); + for (const MCPhysReg *I = CSR; *I; ++I) + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) + CalleeSavedAliases[*AI] = *I; + Update = true; } - CalleeSaved = CSR; + CalleeSavedRegs = CSR; // Different reserved registers? const BitVector &RR = MF->getRegInfo().getReservedRegs(); @@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned Cost = TRI->getCostPerUse(PhysReg); MinCost = std::min(MinCost, Cost); - if (CSRNum[PhysReg]) + if (CalleeSavedAliases[PhysReg]) // PhysReg aliases a CSR, save it for later. 
CSRAlias.push_back(PhysReg); else { @@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { } } RCI.NumRegs = N + CSRAlias.size(); - assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); + assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); // CSR aliases go after the volatile registers, preserve the target's order. for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) { @@ -156,9 +167,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { const TargetRegisterClass *RC = nullptr; unsigned NumRCUnits = 0; - for (TargetRegisterInfo::regclass_iterator - RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) { - const int *PSetID = TRI->getRegClassPressureSets(*RI); + for (const TargetRegisterClass *C : TRI->regclasses()) { + const int *PSetID = TRI->getRegClassPressureSets(C); for (; *PSetID != -1; ++PSetID) { if ((unsigned)*PSetID == Idx) break; @@ -168,9 +178,9 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { // Found a register class that counts against this pressure set. // For efficiency, only compute the set order for the largest set. - unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit; + unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit; if (!RC || NUnits > NumRCUnits) { - RC = *RI; + RC = C; NumRCUnits = NUnits; } } diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp index 4bb3c22..a67d07b 100644 --- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -189,6 +190,9 @@ namespace { /// This returns true if an interval was modified. bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); + /// We found a copy which can be moved to its less frequent predecessor. + bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI); + /// If the source of a copy is defined by a /// trivial computation, replace the copy by rematerialize the definition. bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI, @@ -811,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, VNInfo *ASubValNo = SA.getVNInfoAt(AIdx); assert(ASubValNo != nullptr); - LaneBitmask AMask = SA.LaneMask; - for (LiveInterval::SubRange &SB : IntB.subranges()) { - LaneBitmask BMask = SB.LaneMask; - LaneBitmask Common = BMask & AMask; - if (Common.none()) - continue; - - DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask) - << " into " << PrintLaneMask(Common) << '\n'); - LaneBitmask BRest = BMask & ~AMask; - LiveInterval::SubRange *CommonRange; - if (BRest.any()) { - SB.LaneMask = BRest; - DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest) - << '\n'); - // Duplicate SubRange for newly merged common stuff. - CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB); - } else { - // We van reuse the L SubRange. 
- SB.LaneMask = Common; - CommonRange = &SB; - } - LiveRange RangeCopy(SB, Allocator); - - VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx); - assert(BSubValNo->def == CopyIdx); - BSubValNo->def = ASubValNo->def; - addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo); - AMask &= ~BMask; - } - if (AMask.any()) { - DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n'); - LiveRange *NewRange = IntB.createSubRange(Allocator, AMask); - VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator); - addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo); - } + IntB.refineSubRanges(Allocator, SA.LaneMask, + [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) { + VNInfo *BSubValNo = SR.empty() + ? SR.getNextValue(CopyIdx, Allocator) + : SR.getVNInfoAt(CopyIdx); + assert(BSubValNo != nullptr); + addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo); + }); } } @@ -861,6 +837,191 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, return true; } +/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a +/// predecessor of BB2, and if B is not redefined on the way from A = B +/// in BB0 to B = A in BB2, B = A in BB2 is partially redundant if the +/// execution goes through the path from BB0 to BB2. We may move B = A +/// to the predecessor without such a reversed copy. +/// So we will transform the program from: +/// BB0: +/// A = B; BB1: +/// ... ... +/// / \ / +/// BB2: +/// ... +/// B = A; +/// +/// to: +/// +/// BB0: BB1: +/// A = B; ... +/// ... B = A; +/// / \ / +/// BB2: +/// ... +/// +/// A special case is when BB0 and BB2 are the same BB, which is the only +/// BB in a loop: +/// BB1: +/// ... +/// BB0/BB2: ---- +/// B = A; | +/// ... | +/// A = B; | +/// |------- +/// | +/// We may hoist B = A from BB0/BB2 to BB1. +/// +/// The major preconditions for correctness to remove such partial +/// redundancy include: +/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of +/// the PHI is defined by the reversed copy A = B in BB0. +/// 2. No B is referenced from the start of BB2 to B = A. +/// 3. No B is defined from A = B to the end of BB0. +/// 4. BB1 has only one successor. +/// +/// 2 and 4 implicitly ensure B is not live at the end of BB1. +/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a +/// colder place, which not only prevents an endless loop, but also makes sure +/// the movement of the copy is beneficial. +bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, + MachineInstr &CopyMI) { + assert(!CP.isPhys()); + if (!CopyMI.isFullCopy()) + return false; + + MachineBasicBlock &MBB = *CopyMI.getParent(); + if (MBB.isEHPad()) + return false; + + if (MBB.pred_size() != 2) + return false; + + LiveInterval &IntA = + LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); + + // A is defined by PHI at the entry of MBB. + SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true); + VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx); + assert(AValNo && !AValNo->isUnused() && "COPY source not live"); + if (!AValNo->isPHIDef()) + return false; + + // No B is referenced before CopyMI in MBB. + if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx)) + return false; + + // MBB has two predecessors: one contains A = B so no copy will be inserted + // for it. The other one will have a copy moved from MBB.
+ bool FoundReverseCopy = false; + MachineBasicBlock *CopyLeftBB = nullptr; + for (MachineBasicBlock *Pred : MBB.predecessors()) { + VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred)); + MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def); + if (!DefMI || !DefMI->isFullCopy()) { + CopyLeftBB = Pred; + continue; + } + // Check DefMI is a reverse copy and it is in BB Pred. + if (DefMI->getOperand(0).getReg() != IntA.reg || + DefMI->getOperand(1).getReg() != IntB.reg || + DefMI->getParent() != Pred) { + CopyLeftBB = Pred; + continue; + } + // If there is any other def of B after DefMI and before the end of Pred, + // we need to keep the copy of B = A at the end of Pred if we remove + // B = A from MBB. + bool ValB_Changed = false; + for (auto VNI : IntB.valnos) { + if (VNI->isUnused()) + continue; + if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) { + ValB_Changed = true; + break; + } + } + if (ValB_Changed) { + CopyLeftBB = Pred; + continue; + } + FoundReverseCopy = true; + } + + // If no reverse copy is found in predecessors, nothing to do. + if (!FoundReverseCopy) + return false; + + // If CopyLeftBB is nullptr, it means every predecessor of MBB contains + // reverse copy, CopyMI can be removed trivially if only IntA/IntB is updated. + // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and + // update IntA/IntB. + // + // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so + // MBB is hotter than CopyLeftBB. + if (CopyLeftBB && CopyLeftBB->succ_size() > 1) + return false; + + // Now ok to move copy. + if (CopyLeftBB) { + DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#" + << CopyLeftBB->getNumber() << '\t' << CopyMI); + + // Insert new copy to CopyLeftBB. + auto InsPos = CopyLeftBB->getFirstTerminator(); + MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(), + TII->get(TargetOpcode::COPY), IntB.reg) + .addReg(IntA.reg); + SlotIndex NewCopyIdx = + LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot(); + IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); + for (LiveInterval::SubRange &SR : IntB.subranges()) + SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); + + // If the newly created Instruction has an address of an instruction that was + // deleted before (object recycled by the allocator) it needs to be removed from + // the deleted list. + ErasedInstrs.erase(NewCopyMI); + } else { + DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#" + << MBB.getNumber() << '\t' << CopyMI); + } + + // Remove CopyMI. + // Note: This is fine to remove the copy before updating the live-ranges. + // While updating the live-ranges, we only look at slot indices and + // never go back to the instruction. + LIS->RemoveMachineInstrFromMaps(CopyMI); + // Mark instructions as deleted. + ErasedInstrs.insert(&CopyMI); + CopyMI.eraseFromParent(); + + // Update the liveness. + SmallVector<SlotIndex, 8> EndPoints; + VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead(); + LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(), + &EndPoints); + BValNo->markUnused(); + // Extend IntB to the EndPoints of its original live interval. + LIS->extendToIndices(IntB, EndPoints); + + // Now, do the same for its subranges. 
+ for (LiveInterval::SubRange &SR : IntB.subranges()) { + EndPoints.clear(); + VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead(); + assert(BValNo && "All sublanes should be live"); + LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints); + BValNo->markUnused(); + LIS->extendToIndices(SR, EndPoints); + } + + // Finally, update the live-range of IntA. + shrinkToUses(&IntA); + return true; +} + /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just /// defining a subregister. static bool definesFullReg(const MachineInstr &MI, unsigned Reg) { @@ -1066,6 +1227,34 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, SR->createDeadDef(DefIndex, Alloc); } } + + // Make sure that the subrange for resultant undef is removed + // For example: + // vreg1:sub1<def,read-undef> = LOAD CONSTANT 1 + // vreg2<def> = COPY vreg1 + // ==> + // vreg2:sub1<def, read-undef> = LOAD CONSTANT 1 + // ; Correct but need to remove the subrange for vreg2:sub0 + // ; as it is now undef + if (NewIdx != 0 && DstInt.hasSubRanges()) { + // The affected subregister segments can be removed. + SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI); + LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(NewIdx); + bool UpdatedSubRanges = false; + for (LiveInterval::SubRange &SR : DstInt.subranges()) { + if ((SR.LaneMask & DstMask).none()) { + DEBUG(dbgs() << "Removing undefined SubRange " + << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n"); + // VNI is in ValNo - remove any segments in this SubRange that have this ValNo + if (VNInfo *RmValNo = SR.getVNInfoAt(CurrIdx.getRegSlot())) { + SR.removeValNo(RmValNo); + UpdatedSubRanges = true; + } + } + } + if (UpdatedSubRanges) + DstInt.removeEmptySubRanges(); + } } else if (NewMI.getOperand(0).getReg() != CopyDstReg) { // The New instruction may be defining a sub-register of what's actually // been asked for. If so it must implicitly define the whole thing. @@ -1290,7 +1479,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, // If SrcReg wasn't read, it may still be the case that DstReg is live-in // because SrcReg is a sub-register. - if (DstInt && !Reads && SubIdx) + if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); // Replace SrcReg with DstReg in all UseMI operands. @@ -1486,6 +1675,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { } } + // Try and see if we can partially eliminate the copy by moving the copy to + // its predecessor. + if (!CP.isPartial() && !CP.isPhys()) + if (removePartialRedundancy(CP, *CopyMI)) + return true; + // Otherwise, we are unable to join the intervals. DEBUG(dbgs() << "\tInterference!\n"); Again = true; // May be possible to coalesce later. @@ -1583,6 +1778,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { return false; } } + + // We must also check for overlaps with regmask clobbers. + BitVector RegMaskUsable; + if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) && + !RegMaskUsable.test(DstReg)) { + DEBUG(dbgs() << "\t\tRegMask interference\n"); + return false; + } } // Skip any value computations, we are not adding new values to the @@ -1636,14 +1839,6 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { DEBUG(dbgs() << "\t\tInterference (read): " << *MI); return false; } - - // We must also check for clobbers caused by regmasks. 
- for (const auto &MO : MI->operands()) { - if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) { - DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI); - return false; - } - } } } @@ -2506,11 +2701,17 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) { // Look for values being erased. bool DidPrune = false; for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) { - if (Vals[i].Resolution != CR_Erase) + // We should trigger in all cases in which eraseInstrs() does something. + // match what eraseInstrs() is doing, print a message so + if (Vals[i].Resolution != CR_Erase && + (Vals[i].Resolution != CR_Keep || !Vals[i].ErasableImplicitDef || + !Vals[i].Pruned)) continue; // Check subranges at the point where the copy will be removed. SlotIndex Def = LR.getValNumInfo(i)->def; + // Print message so mismatches with eraseInstrs() can be diagnosed. + DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def << '\n'); for (LiveInterval::SubRange &S : LI.subranges()) { LiveQueryResult Q = S.Query(Def); @@ -2738,39 +2939,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, LaneBitmask LaneMask, CoalescerPair &CP) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); - for (LiveInterval::SubRange &R : LI.subranges()) { - LaneBitmask RMask = R.LaneMask; - // LaneMask of subregisters common to subrange R and ToMerge. - LaneBitmask Common = RMask & LaneMask; - // There is nothing to do without common subregs. - if (Common.none()) - continue; - - DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into " - << PrintLaneMask(Common) << '\n'); - // LaneMask of subregisters contained in the R range but not in ToMerge, - // they have to split into their own subrange. - LaneBitmask LRest = RMask & ~LaneMask; - LiveInterval::SubRange *CommonRange; - if (LRest.any()) { - R.LaneMask = LRest; - DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n'); - // Duplicate SubRange for newly merged common stuff. - CommonRange = LI.createSubRangeFrom(Allocator, Common, R); + LI.refineSubRanges(Allocator, LaneMask, + [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) { + if (SR.empty()) { + SR.assign(ToMerge, Allocator); } else { - // Reuse the existing range. - R.LaneMask = Common; - CommonRange = &R; + // joinSubRegRange() destroys the merged range, so we need a copy. + LiveRange RangeCopy(ToMerge, Allocator); + joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP); } - LiveRange RangeCopy(ToMerge, Allocator); - joinSubRegRanges(*CommonRange, RangeCopy, Common, CP); - LaneMask &= ~RMask; - } - - if (LaneMask.any()) { - DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n'); - LI.createSubRangeFrom(Allocator, LaneMask, ToMerge); - } + }); } bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { @@ -2952,7 +3130,7 @@ copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) { continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. 
- if (ErasedInstrs.erase(CurrList[i])) { + if (ErasedInstrs.count(CurrList[i])) { CurrList[i] = nullptr; continue; } @@ -3077,7 +3255,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) { CurrList(WorkList.begin() + PrevSize, WorkList.end()); if (copyCoalesceWorkList(CurrList)) WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(), - (MachineInstr*)nullptr), WorkList.end()); + nullptr), WorkList.end()); } void RegisterCoalescer::coalesceLocals() { diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp index fc84aeb..88e0a3b 100644 --- a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp @@ -1,4 +1,4 @@ -//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===// +//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===// // // The LLVM Compiler Infrastructure // @@ -13,12 +13,36 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> using namespace llvm; @@ -52,6 +76,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure, } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure, const TargetRegisterInfo *TRI) { @@ -97,6 +122,7 @@ void RegPressureTracker::dump() const { P.dump(TRI); } +LLVM_DUMP_METHOD void PressureDiff::dump(const TargetRegisterInfo &TRI) const { const char *sep = ""; for (const PressureChange &Change : *this) { @@ -108,6 +134,7 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { } dbgs() << '\n'; } +#endif void RegPressureTracker::increaseRegPressure(unsigned RegUnit, LaneBitmask PreviousMask, @@ -264,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const { MachineBasicBlock::const_iterator()); } - SlotIndex RegPressureTracker::getCurrSlot() const { MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward(CurrPos, MBB->end()); @@ -328,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits, unsigned RegUnit) { - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) @@ -340,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits, 
RegisterMaskPair Pair) { unsigned RegUnit = Pair.RegUnit; assert(Pair.LaneMask.any()); - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) { @@ -352,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits, static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits, unsigned RegUnit) { - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) { @@ -366,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits, RegisterMaskPair Pair) { unsigned RegUnit = Pair.RegUnit; assert(Pair.LaneMask.any()); - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I != RegUnits.end()) { @@ -423,6 +449,8 @@ namespace { /// /// FIXME: always ignore tied opers class RegisterOperandsCollector { + friend class llvm::RegisterOperands; + RegisterOperands &RegOpers; const TargetRegisterInfo &TRI; const MachineRegisterInfo &MRI; @@ -517,11 +545,9 @@ class RegisterOperandsCollector { addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll())); } } - - friend class llvm::RegisterOperands; }; -} // namespace +} // end anonymous namespace void RegisterOperands::collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, @@ -674,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair, assert(Pair.LaneMask.any()); unsigned RegUnit = Pair.RegUnit; - auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) { + auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) { return Other.RegUnit == RegUnit; }); LaneBitmask PrevMask; @@ -772,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, if (!TrackLaneMasks) { addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask)); } else { - auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { - return Other.RegUnit == Reg; - }); + auto I = + llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { + return Other.RegUnit == Reg; + }); bool IsRedef = I != LiveUses->end(); if (IsRedef) { // ignore re-defs here... 
@@ -1154,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) { int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); - if (CritInc > 0 && CritInc <= INT16_MAX) { + if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) { Delta.CriticalMax = PressureChange(PSetID); Delta.CriticalMax.setUnitInc(CritInc); } diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp index fdf741f..fc5105a 100644 --- a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -1,4 +1,4 @@ -//===-- RegisterScavenging.cpp - Machine register scavenging --------------===// +//===- RegisterScavenging.cpp - Machine register scavenging ---------------===// // // The LLVM Compiler Infrastructure // @@ -16,27 +16,39 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/RegisterScavenging.h" + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/PassSupport.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <string> + using namespace llvm; #define DEBUG_TYPE "reg-scavenging" +STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); + void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) { - for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { - LaneBitmask UnitMask = (*RUI).second; - if (UnitMask.none() || (LaneMask & UnitMask).any()) - RegUnitsAvailable.reset((*RUI).first); - } + LiveUnits.addRegMasked(Reg, LaneMask); } void RegScavenger::init(MachineBasicBlock &MBB) { @@ -44,6 +56,7 @@ void RegScavenger::init(MachineBasicBlock &MBB) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); + LiveUnits.init(*TRI); assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) && "Target changed?"); @@ -51,45 +64,28 @@ void RegScavenger::init(MachineBasicBlock &MBB) { // Self-initialize. if (!this->MBB) { NumRegUnits = TRI->getNumRegUnits(); - RegUnitsAvailable.resize(NumRegUnits); KillRegUnits.resize(NumRegUnits); DefRegUnits.resize(NumRegUnits); TmpRegUnits.resize(NumRegUnits); } this->MBB = &MBB; - for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(), - IE = Scavenged.end(); I != IE; ++I) { - I->Reg = 0; - I->Restore = nullptr; + for (ScavengedInfo &SI : Scavenged) { + SI.Reg = 0; + SI.Restore = nullptr; } - // All register units start out unused. - RegUnitsAvailable.set(); - - // Pristine CSRs are not available. 
- BitVector PR = MF.getFrameInfo().getPristineRegs(MF); - for (int I = PR.find_first(); I>0; I = PR.find_next(I)) - setRegUsed(I); - Tracking = false; } -void RegScavenger::setLiveInsUsed(const MachineBasicBlock &MBB) { - for (const auto &LI : MBB.liveins()) - setRegUsed(LI.PhysReg, LI.LaneMask); -} - void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) { init(MBB); - setLiveInsUsed(MBB); + LiveUnits.addLiveIns(MBB); } void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) { init(MBB); - // Merge live-ins of successors to get live-outs. - for (const MachineBasicBlock *Succ : MBB.successors()) - setLiveInsUsed(*Succ); + LiveUnits.addLiveOuts(MBB); // Move internal iterator at the last instruction of the block. if (MBB.begin() != MBB.end()) { @@ -263,34 +259,13 @@ void RegScavenger::backward() { assert(Tracking && "Must be tracking to determine kills and defs"); const MachineInstr &MI = *MBBI; - // Defined or clobbered registers are available now. - for (const MachineOperand &MO : MI.operands()) { - if (MO.isRegMask()) { - for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; - ++RU) { - for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) { - if (MO.clobbersPhysReg(*RURI)) { - RegUnitsAvailable.set(RU); - break; - } - } - } - } else if (MO.isReg() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || - isReserved(Reg)) - continue; - addRegUnits(RegUnitsAvailable, Reg); - } - } - // Mark read registers as unavailable. - for (const MachineOperand &MO : MI.uses()) { - if (MO.isReg() && MO.readsReg()) { - unsigned Reg = MO.getReg(); - if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || - isReserved(Reg)) - continue; - removeRegUnits(RegUnitsAvailable, Reg); + LiveUnits.stepBackward(MI); + + // Expire scavenge spill frameindex uses. + for (ScavengedInfo &I : Scavenged) { + if (I.Restore == &MI) { + I.Reg = 0; + I.Restore = nullptr; } } @@ -302,12 +277,9 @@ void RegScavenger::backward() { } bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const { - if (includeReserved && isReserved(Reg)) - return true; - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) - if (!RegUnitsAvailable.test(*RUI)) - return true; - return false; + if (isReserved(Reg)) + return includeReserved; + return !LiveUnits.available(Reg); } unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { @@ -393,6 +365,86 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, return Survivor; } +/// Search backwards from position \p From for a register that is part of +/// \p AllocationOrder and not used/clobbered until the point \p To. If there +/// are multiple candidates, continue searching and pick the one that is not +/// used/clobbered for the longest time. +/// Returns the register and the earliest position we know it to be free, or +/// the position MBB.end() if no register is available.
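The search described by this comment can be modeled without any LLVM machinery: walk backwards from From to To, accumulate every register touched on the way, and at To prefer a register that was never touched. The sketch below covers only that first case; the real findSurvivorBackwards additionally keeps scanning past To to pick the candidate that stays free the longest before falling back to a spill (all names here are stand-ins, not the patch's API):

  #include <cstddef>
  #include <set>
  #include <vector>

  // UsedByInstr[i] holds the registers read, written or clobbered by
  // instruction i. From >= To; the walk is backwards and inclusive.
  unsigned findUntouchedCandidate(const std::vector<std::set<unsigned>> &UsedByInstr,
                                  std::size_t From, std::size_t To,
                                  const std::vector<unsigned> &AllocationOrder) {
    std::set<unsigned> Used;
    for (std::size_t I = From + 1; I-- > To; )
      Used.insert(UsedByInstr[I].begin(), UsedByInstr[I].end());
    for (unsigned Reg : AllocationOrder)
      if (!Used.count(Reg))
        return Reg; // free over the whole range, no spill needed
    return 0;       // every candidate was touched; a spill would be required
  }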
+static std::pair<MCPhysReg, MachineBasicBlock::iterator> +findSurvivorBackwards(const MachineRegisterInfo &MRI, + MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, + const LiveRegUnits &LiveOut, ArrayRef<MCPhysReg> AllocationOrder, + bool RestoreAfter) { + bool FoundTo = false; + MCPhysReg Survivor = 0; + MachineBasicBlock::iterator Pos; + MachineBasicBlock &MBB = *From->getParent(); + unsigned InstrLimit = 25; + unsigned InstrCountDown = InstrLimit; + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + LiveRegUnits Used(TRI); + + for (MachineBasicBlock::iterator I = From;; --I) { + const MachineInstr &MI = *I; + + Used.accumulate(MI); + + if (I == To) { + // See if one of the registers in RC wasn't used so far. + for (MCPhysReg Reg : AllocationOrder) { + if (!MRI.isReserved(Reg) && Used.available(Reg) && + LiveOut.available(Reg)) + return std::make_pair(Reg, MBB.end()); + } + // Otherwise we will continue up to InstrLimit instructions to find + // the register which is not defined/used for the longest time. + FoundTo = true; + Pos = To; + // Note: It was fine so far to start our search at From, however now that + // we have to spill, and can only place the restore after From then + // add the regs used/defed by std::next(From) to the set. + if (RestoreAfter) + Used.accumulate(*std::next(From)); + } + if (FoundTo) { + if (Survivor == 0 || !Used.available(Survivor)) { + MCPhysReg AvilableReg = 0; + for (MCPhysReg Reg : AllocationOrder) { + if (!MRI.isReserved(Reg) && Used.available(Reg)) { + AvilableReg = Reg; + break; + } + } + if (AvilableReg == 0) + break; + Survivor = AvilableReg; + } + if (--InstrCountDown == 0) + break; + + // Keep searching when we find a vreg since the spilled register will + // be usefull for this other vreg as well later. + bool FoundVReg = false; + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + FoundVReg = true; + break; + } + } + if (FoundVReg) { + InstrCountDown = InstrLimit; + Pos = I; + } + if (I == MBB.begin()) + break; + } + } + + return std::make_pair(Survivor, Pos); +} + static unsigned getFrameIndexOperandNum(MachineInstr &MI) { unsigned i = 0; while (!MI.getOperand(i).isFI()) { @@ -402,46 +454,18 @@ static unsigned getFrameIndexOperandNum(MachineInstr &MI) { return i; } -unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, - MachineBasicBlock::iterator I, - int SPAdj) { - MachineInstr &MI = *I; - const MachineFunction &MF = *MI.getParent()->getParent(); - // Consider all allocatable registers in the register class initially - BitVector Candidates = TRI->getAllocatableSet(MF, RC); - - // Exclude all the registers being used by the instruction. - for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) - Candidates.reset(*AI); - } - - // Try to find a register that's unused if there is one, as then we won't - // have to spill. - BitVector Available = getRegsAvailable(RC); - Available &= Candidates; - if (Available.any()) - Candidates = Available; - - // Find the register whose use is furthest away. - MachineBasicBlock::iterator UseMI; - unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); - - // If we found an unused register there is no reason to spill it. 
- if (!isRegUsed(SReg)) { - DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n"); - return SReg; - } - +RegScavenger::ScavengedInfo & +RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj, + MachineBasicBlock::iterator Before, + MachineBasicBlock::iterator &UseMI) { // Find an available scavenging slot with size and alignment matching // the requirements of the class RC. + const MachineFunction &MF = *Before->getParent()->getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned NeedSize = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + unsigned NeedSize = TRI->getSpillSize(RC); + unsigned NeedAlign = TRI->getSpillAlignment(RC); - unsigned SI = Scavenged.size(), Diff = UINT_MAX; + unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max(); int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd(); for (unsigned I = 0; I < Scavenged.size(); ++I) { if (Scavenged[I].Reg != 0) @@ -474,42 +498,303 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, } // Avoid infinite regress - Scavenged[SI].Reg = SReg; + Scavenged[SI].Reg = Reg; // If the target knows how to save/restore the register, let it do so; // otherwise, use the emergency stack spill slot. - if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) { - // Spill the scavenged register before I. + if (!TRI->saveScavengerRegister(*MBB, Before, UseMI, &RC, Reg)) { + // Spill the scavenged register before \p Before. int FI = Scavenged[SI].FrameIndex; if (FI < FIB || FI >= FIE) { std::string Msg = std::string("Error while trying to spill ") + - TRI->getName(SReg) + " from class " + TRI->getRegClassName(RC) + + TRI->getName(Reg) + " from class " + TRI->getRegClassName(&RC) + ": Cannot scavenge register without an emergency spill slot!"; report_fatal_error(Msg.c_str()); } - TII->storeRegToStackSlot(*MBB, I, SReg, true, Scavenged[SI].FrameIndex, - RC, TRI); - MachineBasicBlock::iterator II = std::prev(I); + TII->storeRegToStackSlot(*MBB, Before, Reg, true, Scavenged[SI].FrameIndex, + &RC, TRI); + MachineBasicBlock::iterator II = std::prev(Before); unsigned FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); // Restore the scavenged register before its use (or first terminator). - TII->loadRegFromStackSlot(*MBB, UseMI, SReg, Scavenged[SI].FrameIndex, - RC, TRI); + TII->loadRegFromStackSlot(*MBB, UseMI, Reg, Scavenged[SI].FrameIndex, + &RC, TRI); II = std::prev(UseMI); FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); } + return Scavenged[SI]; +} - Scavenged[SI].Restore = &*std::prev(UseMI); +unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, + MachineBasicBlock::iterator I, + int SPAdj) { + MachineInstr &MI = *I; + const MachineFunction &MF = *MI.getParent()->getParent(); + // Consider all allocatable registers in the register class initially + BitVector Candidates = TRI->getAllocatableSet(MF, RC); + + // Exclude all the registers being used by the instruction. + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) && + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) + Candidates.reset(*AI); + } - // Doing this here leads to infinite regress. - // Scavenged[SI].Reg = SReg; + // Try to find a register that's unused if there is one, as then we won't + // have to spill. 
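The "prefer an unused register" step mentioned in the comment above is just a set intersection with a fallback: restrict the candidates to the currently available registers when that intersection is non-empty, otherwise keep the full candidate set and accept that the chosen survivor must be spilled. A minimal standalone version (std::set standing in for BitVector, names hypothetical):

  #include <set>

  std::set<unsigned> preferUnused(const std::set<unsigned> &Candidates,
                                  const std::set<unsigned> &Available) {
    std::set<unsigned> Unused;
    for (unsigned Reg : Candidates)
      if (Available.count(Reg))
        Unused.insert(Reg);
    // If nothing is free we must pick from the full set and spill the choice.
    return Unused.empty() ? Candidates : Unused;
  }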
+ BitVector Available = getRegsAvailable(RC); + Available &= Candidates; + if (Available.any()) + Candidates = Available; + + // Find the register whose use is furthest away. + MachineBasicBlock::iterator UseMI; + unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); + + // If we found an unused register there is no reason to spill it. + if (!isRegUsed(SReg)) { + DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n"); + return SReg; + } + + ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI); + Scavenged.Restore = &*std::prev(UseMI); DEBUG(dbgs() << "Scavenged register (with spill): " << TRI->getName(SReg) << "\n"); return SReg; } + +unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC, + MachineBasicBlock::iterator To, + bool RestoreAfter, int SPAdj) { + const MachineBasicBlock &MBB = *To->getParent(); + const MachineFunction &MF = *MBB.getParent(); + + // Find the register whose use is furthest away. + MachineBasicBlock::iterator UseMI; + ArrayRef<MCPhysReg> AllocationOrder = RC.getRawAllocationOrder(MF); + std::pair<MCPhysReg, MachineBasicBlock::iterator> P = + findSurvivorBackwards(*MRI, MBBI, To, LiveUnits, AllocationOrder, + RestoreAfter); + MCPhysReg Reg = P.first; + MachineBasicBlock::iterator SpillBefore = P.second; + assert(Reg != 0 && "No register left to scavenge!"); + // Found an available register? + if (SpillBefore != MBB.end()) { + MachineBasicBlock::iterator ReloadAfter = + RestoreAfter ? std::next(MBBI) : MBBI; + MachineBasicBlock::iterator ReloadBefore = std::next(ReloadAfter); + DEBUG(dbgs() << "Reload before: " << *ReloadBefore << '\n'); + ScavengedInfo &Scavenged = spill(Reg, RC, SPAdj, SpillBefore, ReloadBefore); + Scavenged.Restore = &*std::prev(SpillBefore); + LiveUnits.removeReg(Reg); + DEBUG(dbgs() << "Scavenged register with spill: " << PrintReg(Reg, TRI) + << " until " << *SpillBefore); + } else { + DEBUG(dbgs() << "Scavenged free register: " << PrintReg(Reg, TRI) << '\n'); + } + return Reg; +} + +/// Allocate a register for the virtual register \p VReg. The last use of +/// \p VReg is around the current position of the register scavenger \p RS. +/// \p ReserveAfter controls whether the scavenged register needs to be reserved +/// after the current instruction, otherwise it will only be reserved before the +/// current instruction. +static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS, + unsigned VReg, bool ReserveAfter) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); +#ifndef NDEBUG + // Verify that all definitions and uses are in the same basic block. + const MachineBasicBlock *CommonMBB = nullptr; + // Real definition for the reg, re-definitions are not considered. + const MachineInstr *RealDef = nullptr; + for (MachineOperand &MO : MRI.reg_nodbg_operands(VReg)) { + MachineBasicBlock *MBB = MO.getParent()->getParent(); + if (CommonMBB == nullptr) + CommonMBB = MBB; + assert(MBB == CommonMBB && "All defs+uses must be in the same basic block"); + if (MO.isDef()) { + const MachineInstr &MI = *MO.getParent(); + if (!MI.readsRegister(VReg, &TRI)) { + assert((!RealDef || RealDef == &MI) && + "Can have at most one definition which is not a redefinition"); + RealDef = &MI; + } + } + } + assert(RealDef != nullptr && "Must have at least 1 Def"); +#endif + + // We should only have one definition of the register. However to accommodate + // the requirements of two address code we also allow definitions in + // subsequent instructions provided they also read the register. 
That way + // we get a single contiguous lifetime. + // + // Definitions in MRI.def_begin() are unordered, search for the first. + MachineRegisterInfo::def_iterator FirstDef = + std::find_if(MRI.def_begin(VReg), MRI.def_end(), + [VReg, &TRI](const MachineOperand &MO) { + return !MO.getParent()->readsRegister(VReg, &TRI); + }); + assert(FirstDef != MRI.def_end() && + "Must have one definition that does not redefine vreg"); + MachineInstr &DefMI = *FirstDef->getParent(); + + // The register scavenger will report a free register inserting an emergency + // spill/reload if necessary. + int SPAdj = 0; + const TargetRegisterClass &RC = *MRI.getRegClass(VReg); + unsigned SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(), + ReserveAfter, SPAdj); + MRI.replaceRegWith(VReg, SReg); + ++NumScavengedRegs; + return SReg; +} + +/// Allocate (scavenge) vregs inside a single basic block. +/// Returns true if the target spill callback created new vregs and a 2nd pass +/// is necessary. +static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI, + RegScavenger &RS, + MachineBasicBlock &MBB) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + RS.enterBasicBlockEnd(MBB); + + unsigned InitialNumVirtRegs = MRI.getNumVirtRegs(); + bool NextInstructionReadsVReg = false; + for (MachineBasicBlock::iterator I = MBB.end(); I != MBB.begin(); ) { + --I; + // Move RegScavenger to the position between *I and *std::next(I). + RS.backward(I); + + // Look for unassigned vregs in the uses of *std::next(I). + if (NextInstructionReadsVReg) { + MachineBasicBlock::iterator N = std::next(I); + const MachineInstr &NMI = *N; + for (const MachineOperand &MO : NMI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + // We only care about virtual registers and ignore virtual registers + // created by the target callbacks in the process (those will be handled + // in a scavenging round). + if (!TargetRegisterInfo::isVirtualRegister(Reg) || + TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs) + continue; + if (!MO.readsReg()) + continue; + + unsigned SReg = scavengeVReg(MRI, RS, Reg, true); + N->addRegisterKilled(SReg, &TRI, false); + RS.setRegUsed(SReg); + } + } + + // Look for unassigned vregs in the defs of *I. + NextInstructionReadsVReg = false; + const MachineInstr &MI = *I; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + // Only vregs, no newly created vregs (see above). + if (!TargetRegisterInfo::isVirtualRegister(Reg) || + TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs) + continue; + // We have to look at all operands anyway so we can precalculate here + // whether there is a reading operand. This allows use to skip the use + // step in the next iteration if there was none. 
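The per-block walk in scavengeFrameVirtualRegsInBlock visits instructions from last to first and only revisits the uses of the following instruction when the previous iteration saw a vreg read. The pattern is easier to see on a toy instruction list (hypothetical types, not MachineInstr; the placeholder comments mark where the real code scavenges a physical register):

  #include <cstddef>
  #include <vector>

  struct ToyInstr {
    std::vector<unsigned> VRegUses;
    std::vector<unsigned> VRegDefs;
  };

  // Backward walk with the same "NextInstructionReadsVReg" short-cut as above:
  // uses of Block[Idx + 1] are only examined when the previous iteration found
  // at least one vreg read; defs of Block[Idx] are always examined.
  void backwardWalk(std::vector<ToyInstr> &Block) {
    bool NextInstructionReadsVReg = false;
    for (std::size_t Idx = Block.size(); Idx-- > 0; ) {
      if (NextInstructionReadsVReg)
        for (unsigned VRegUse : Block[Idx + 1].VRegUses)
          (void)VRegUse; // assign a scavenged physreg to this use here
      NextInstructionReadsVReg = !Block[Idx].VRegUses.empty();
      for (unsigned VRegDef : Block[Idx].VRegDefs)
        (void)VRegDef;   // assign a scavenged physreg to this def here
    }
  }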
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles"); + assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses"); + if (MO.readsReg()) { + NextInstructionReadsVReg = true; + } + if (MO.isDef()) { + unsigned SReg = scavengeVReg(MRI, RS, Reg, false); + I->addRegisterDead(SReg, &TRI, false); + } + } + } +#ifndef NDEBUG + for (const MachineOperand &MO : MBB.front().operands()) { + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + assert(!MO.isInternalRead() && "Cannot assign inside bundles"); + assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses"); + assert(!MO.readsReg() && "Vreg use in first instruction not allowed"); + } +#endif + + return MRI.getNumVirtRegs() != InitialNumVirtRegs; +} + +void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) { + // FIXME: Iterating over the instruction stream is unnecessary. We can simply + // iterate over the vreg use list, which at this point only contains machine + // operands for which eliminateFrameIndex needs a new scratch reg. + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Shortcut. + if (MRI.getNumVirtRegs() == 0) { + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + return; + } + + // Run through the instructions and find any virtual registers. + for (MachineBasicBlock &MBB : MF) { + if (MBB.empty()) + continue; + + bool Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB); + if (Again) { + DEBUG(dbgs() << "Warning: Required two scavenging passes for block " + << MBB.getName() << '\n'); + Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB); + // The target required a 2nd run (because it created new vregs while + // spilling). Refuse to do another pass to keep compile time in check. + if (Again) + report_fatal_error("Incomplete scavenging after 2nd pass"); + } + } + + MRI.clearVirtRegs(); + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); +} + +namespace { +/// This class runs register scavenging independently of the PrologEpilogInserter. +/// This is used for testing. +class ScavengerTest : public MachineFunctionPass { +public: + static char ID; + ScavengerTest() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override { + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetFrameLowering &TFL = *STI.getFrameLowering(); + + RegScavenger RS; + // Let's hope that calling those outside of PrologEpilogInserter works + // well enough to initialize the scavenger with some emergency spillslots + // for the target.
+ BitVector SavedRegs; + TFL.determineCalleeSaves(MF, SavedRegs, &RS); + TFL.processFunctionBeforeFrameFinalized(MF, &RS); + + // Let's scavenge the current function + scavengeFrameVirtualRegs(MF, RS); + return true; + } +}; +char ScavengerTest::ID; + +} // end anonymous namespace + +INITIALIZE_PASS(ScavengerTest, "scavenger-test", + "Scavenge virtual registers inside basic blocks", false, false) diff --git a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp index 66f1966..30757f0 100644 --- a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp @@ -1,4 +1,4 @@ -//===- RegisterUsageInfo.cpp - Register Usage Informartion Storage --------===// +//===- RegisterUsageInfo.cpp - Register Usage Information Storage ---------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +12,22 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> +#include <vector> using namespace llvm; @@ -27,7 +38,7 @@ static cl::opt<bool> DumpRegUsage( cl::desc("print register usage details collected for analysis.")); INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info", - "Register Usage Informartion Stroage", false, true) + "Register Usage Information Storage", false, true) char PhysicalRegisterUsageInfo::ID = 0; @@ -63,7 +74,7 @@ PhysicalRegisterUsageInfo::getRegUsageInfo(const Function *FP) { void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const { const TargetRegisterInfo *TRI; - typedef std::pair<const Function *, std::vector<uint32_t>> FuncPtrRegMaskPair; + using FuncPtrRegMaskPair = std::pair<const Function *, std::vector<uint32_t>>; SmallVector<const FuncPtrRegMaskPair *, 64> FPRMPairVector; diff --git a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp index 2f7ee8b..bd5ecbd 100644 --- a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp @@ -32,10 +32,10 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" using namespace llvm; @@ -112,11 +112,11 @@ char RenameIndependentSubregs::ID; char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID; -INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs", +INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, DEBUG_TYPE, "Rename Independent Subregisters", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs", +INITIALIZE_PASS_END(RenameIndependentSubregs, DEBUG_TYPE, "Rename Independent Subregisters", false, false) bool 
RenameIndependentSubregs::renameComponents(LiveInterval &LI) const { @@ -212,7 +212,7 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, const SmallVectorImpl<SubRangeInfo> &SubRangeInfos, const SmallVectorImpl<LiveInterval*> &Intervals) const { const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); - unsigned Reg = Intervals[0]->reg;; + unsigned Reg = Intervals[0]->reg; for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg), E = MRI->reg_nodbg_end(); I != E; ) { MachineOperand &MO = *I++; @@ -243,6 +243,15 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, unsigned VReg = Intervals[ID]->reg; MO.setReg(VReg); + + if (MO.isTied() && Reg != VReg) { + /// Undef use operands are not tracked in the equivalence class but need + /// to be update if they are tied. + MO.getParent()->substituteRegister(Reg, VReg, 0, TRI); + + // substituteRegister breaks the iterator, so restart. + I = MRI->reg_nodbg_begin(Reg); + } } // TODO: We could attempt to recompute new register classes while visiting // the operands: Some of the split register may be fine with less constraint diff --git a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp index 4519641..01b3db4 100644 --- a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp +++ b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp @@ -14,9 +14,9 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -30,17 +30,23 @@ namespace { /// Tells whether or not this pass should emit a fallback /// diagnostic when it resets a function. bool EmitFallbackDiag; + /// Whether we should abort immediately instead of resetting the function. 
+ bool AbortOnFailedISel; public: static char ID; // Pass identification, replacement for typeid - ResetMachineFunction(bool EmitFallbackDiag = false) - : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag) {} + ResetMachineFunction(bool EmitFallbackDiag = false, + bool AbortOnFailedISel = false) + : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag), + AbortOnFailedISel(AbortOnFailedISel) {} StringRef getPassName() const override { return "ResetMachineFunction"; } bool runOnMachineFunction(MachineFunction &MF) override { if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) { + if (AbortOnFailedISel) + report_fatal_error("Instruction selection failed"); DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n'); ++NumFunctionsReset; MF.reset(); @@ -62,6 +68,7 @@ INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE, "reset machine function if ISel failed", false, false) MachineFunctionPass * -llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false) { - return new ResetMachineFunction(EmitFallbackDiag); +llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false, + bool AbortOnFailedISel = false) { + return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel); } diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp index 2b82df2..8584a9b 100644 --- a/contrib/llvm/lib/CodeGen/SafeStack.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp @@ -19,10 +19,12 @@ #include "SafeStackLayout.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" @@ -50,7 +52,7 @@ using namespace llvm; using namespace llvm::safestack; -#define DEBUG_TYPE "safestack" +#define DEBUG_TYPE "safe-stack" namespace llvm { @@ -92,11 +94,11 @@ public: /// determined statically), and the unsafe stack, which contains all /// local variables that are accessed in ways that we can't prove to /// be safe. -class SafeStack : public FunctionPass { - const TargetMachine *TM; - const TargetLoweringBase *TL; - const DataLayout *DL; - ScalarEvolution *SE; +class SafeStack { + Function &F; + const TargetLoweringBase &TL; + const DataLayout &DL; + ScalarEvolution &SE; Type *StackPtrTy; Type *IntPtrTy; @@ -171,33 +173,21 @@ class SafeStack : public FunctionPass { uint64_t AllocaSize); public: - static char ID; // Pass identification, replacement for typeid. 
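The SafeStack hunks that follow convert the pass into a plain class: the transformation takes the Function and its analyses as constructor arguments and exposes a single run() entry point, while the legacy FunctionPass (SafeStackLegacyPass, further down) is reduced to gathering those inputs and delegating. The shape of that split, sketched with hypothetical stand-in types rather than LLVM's:

  #include <string>

  // Stand-ins for Function / TargetLowering / DataLayout / ScalarEvolution.
  struct FunctionIR { std::string Name; bool Changed = false; };
  struct Analyses { int ScalarEvolution = 0; };

  // The transformation proper: no pass boilerplate, everything injected.
  class SafeStackTransform {
    FunctionIR &F;
    Analyses &SE;
  public:
    SafeStackTransform(FunctionIR &F, Analyses &SE) : F(F), SE(SE) {}
    bool run() {            // returns whether the function was changed
      (void)SE;             // the real rewrite would consult the analyses here
      F.Changed = true;
      return true;
    }
  };

  // The thin legacy wrapper: fetch dependencies, construct, delegate.
  class SafeStackLegacyWrapper {
  public:
    bool runOnFunction(FunctionIR &F) {
      Analyses SE;          // in the real pass: getAnalysis<...>()
      return SafeStackTransform(F, SE).run();
    }
  };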
- SafeStack(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) { - initializeSafeStackPass(*PassRegistry::getPassRegistry()); - } - SafeStack() : SafeStack(nullptr) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolutionWrapperPass>(); - } - - bool doInitialization(Module &M) override { - DL = &M.getDataLayout(); - - StackPtrTy = Type::getInt8PtrTy(M.getContext()); - IntPtrTy = DL->getIntPtrType(M.getContext()); - Int32Ty = Type::getInt32Ty(M.getContext()); - Int8Ty = Type::getInt8Ty(M.getContext()); - - return false; - } - - bool runOnFunction(Function &F) override; -}; // class SafeStack + SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL, + ScalarEvolution &SE) + : F(F), TL(TL), DL(DL), SE(SE), + StackPtrTy(Type::getInt8PtrTy(F.getContext())), + IntPtrTy(DL.getIntPtrType(F.getContext())), + Int32Ty(Type::getInt32Ty(F.getContext())), + Int8Ty(Type::getInt8Ty(F.getContext())) {} + + // Run the transformation on the associated function. + // Returns whether the function was changed. + bool run(); +}; uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { - uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); if (AI->isArrayAllocation()) { auto C = dyn_cast<ConstantInt>(AI->getArraySize()); if (!C) @@ -209,11 +199,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, const Value *AllocaPtr, uint64_t AllocaSize) { - AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); - const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + AllocaOffsetRewriter Rewriter(SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr)); - uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); - ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); + uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType()); + ConstantRange AccessStartRange = SE.getUnsignedRange(Expr); ConstantRange SizeRange = ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); @@ -226,8 +216,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, << *AllocaPtr << "\n" << " Access " << *Addr << "\n" << " SCEV " << *Expr - << " U: " << SE->getUnsignedRange(Expr) - << ", S: " << SE->getSignedRange(Expr) << "\n" + << " U: " << SE.getUnsignedRange(Expr) + << ", S: " << SE.getSignedRange(Expr) << "\n" << " Range " << AccessRange << "\n" << " AllocaRange " << AllocaRange << "\n" << " " << (Safe ? 
"safe" : "unsafe") << "\n"); @@ -266,7 +256,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { switch (I->getOpcode()) { case Instruction::Load: { - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr, AllocaSize)) return false; break; @@ -282,7 +272,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { return false; } - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()), AllocaPtr, AllocaSize)) return false; break; @@ -343,7 +333,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { } Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { - Value *StackGuardVar = TL->getIRStackGuard(IRB); + Value *StackGuardVar = TL.getIRStackGuard(IRB); if (!StackGuardVar) StackGuardVar = F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy); @@ -390,7 +380,7 @@ void SafeStack::findInsts(Function &F, if (!Arg.hasByValAttr()) continue; uint64_t Size = - DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + DL.getTypeStoreSize(Arg.getType()->getPointerElementType()); if (IsSafeStackAlloca(&Arg, Size)) continue; @@ -451,7 +441,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI, IRBuilder<> IRBFail(CheckTerm); // FIXME: respect -fsanitize-trap / -ftrap-function here? Constant *StackChkFail = F.getParent()->getOrInsertFunction( - "__stack_chk_fail", IRB.getVoidTy(), nullptr); + "__stack_chk_fail", IRB.getVoidTy()); IRBFail.CreateCall(StackChkFail, {}); } @@ -476,19 +466,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( if (StackGuardSlot) { Type *Ty = StackGuardSlot->getAllocatedType(); unsigned Align = - std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); + std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { Type *Ty = Arg->getType()->getPointerElementType(); - uint64_t Size = DL->getTypeStoreSize(Ty); + uint64_t Size = DL.getTypeStoreSize(Ty); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty), Arg->getParamAlignment()); SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange()); } @@ -501,7 +491,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( // Ensure the object is properly aligned. unsigned Align = - std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()); + std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()); SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI)); } @@ -539,7 +529,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Offset = SSL.getObjectOffset(Arg); Type *Ty = Arg->getType()->getPointerElementType(); - uint64_t Size = DL->getTypeStoreSize(Ty); + uint64_t Size = DL.getTypeStoreSize(Ty); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. @@ -550,7 +540,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( // Replace alloc with the new location. 
replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, - /*Deref=*/true, -Offset); + /*Deref=*/false, -Offset); Arg->replaceAllUsesWith(NewArg); IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode()); IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); @@ -565,7 +555,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( if (Size == 0) Size = 1; // Don't create zero-sized stack objects. - replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -Offset); + replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/false, -Offset); replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset); // Replace uses of the alloca with the new location. @@ -630,7 +620,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false); Type *Ty = AI->getAllocatedType(); - uint64_t TySize = DL->getTypeAllocSize(Ty); + uint64_t TySize = DL.getTypeAllocSize(Ty); Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize)); Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy); @@ -638,7 +628,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( // Align the SP value to satisfy the AllocaInst, type and stack alignments. unsigned Align = std::max( - std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()), + std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()), (unsigned)StackAlignment); assert(isPowerOf2_32(Align)); @@ -655,7 +645,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( if (AI->hasName() && isa<Instruction>(NewAI)) NewAI->takeName(AI); - replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); + replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/false); AI->replaceAllUsesWith(NewAI); AI->eraseFromParent(); } @@ -685,25 +675,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( } } -bool SafeStack::runOnFunction(Function &F) { - DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n"); - - if (!F.hasFnAttribute(Attribute::SafeStack)) { - DEBUG(dbgs() << "[SafeStack] safestack is not requested" - " for this function\n"); - return false; - } - - if (F.isDeclaration()) { - DEBUG(dbgs() << "[SafeStack] function definition" - " is not available\n"); - return false; - } - - if (!TM) - report_fatal_error("Target machine is required"); - TL = TM->getSubtargetImpl(F)->getTargetLowering(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +bool SafeStack::run() { + assert(F.hasFnAttribute(Attribute::SafeStack) && + "Can't run SafeStack on a function without the attribute"); + assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration"); ++NumFunctions; @@ -736,7 +711,7 @@ bool SafeStack::runOnFunction(Function &F) { ++NumUnsafeStackRestorePointsFunctions; IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt()); - UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB); + UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB); // Load the current stack pointer (we'll also use it as a base pointer). // FIXME: use a dedicated register for it ? @@ -788,14 +763,67 @@ bool SafeStack::runOnFunction(Function &F) { return true; } +class SafeStackLegacyPass : public FunctionPass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification, replacement for typeid.. 
+ SafeStackLegacyPass() : FunctionPass(ID), TM(nullptr) { + initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + } + + bool runOnFunction(Function &F) override { + DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n"); + + if (!F.hasFnAttribute(Attribute::SafeStack)) { + DEBUG(dbgs() << "[SafeStack] safestack is not requested" + " for this function\n"); + return false; + } + + if (F.isDeclaration()) { + DEBUG(dbgs() << "[SafeStack] function definition" + " is not available\n"); + return false; + } + + TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + auto *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + if (!TL) + report_fatal_error("TargetLowering instance is required"); + + auto *DL = &F.getParent()->getDataLayout(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + + // Compute DT and LI only for functions that have the attribute. + // This is only useful because the legacy pass manager doesn't let us + // compute analyzes lazily. + // In the backend pipeline, nothing preserves DT before SafeStack, so we + // would otherwise always compute it wastefully, even if there is no + // function with the safestack attribute. + DominatorTree DT(F); + LoopInfo LI(DT); + + ScalarEvolution SE(F, TLI, ACT, DT, LI); + + return SafeStack(F, *TL, *DL, SE).run(); + } +}; + } // anonymous namespace -char SafeStack::ID = 0; -INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack", - "Safe Stack instrumentation pass", false, false) -INITIALIZE_TM_PASS_END(SafeStack, "safe-stack", - "Safe Stack instrumentation pass", false, false) +char SafeStackLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, DEBUG_TYPE, + "Safe Stack instrumentation pass", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(SafeStackLegacyPass, DEBUG_TYPE, + "Safe Stack instrumentation pass", false, false) -FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) { - return new SafeStack(TM); -} +FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); } diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp index 7fbeadd..21f2fa4 100644 --- a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp @@ -20,9 +20,10 @@ using namespace llvm::safestack; #define DEBUG_TYPE "safestackcoloring" +// Disabled by default due to PR32143. 
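
// For illustration only, a hypothetical call site (the pass-config subclass and
// hook name below are assumed, they are not part of this patch): with the change
// above, createSafeStackPass() no longer takes a TargetMachine, because the
// legacy pass now reaches the TargetMachine through the required
// TargetPassConfig analysis.

void MyTargetPassConfig::addIRPasses() {   // hypothetical subclass
  addPass(createSafeStackPass());          // previously: createSafeStackPass(TM)
}
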
static cl::opt<bool> ClColoring("safe-stack-coloring", cl::desc("enable safe stack coloring"), - cl::Hidden, cl::init(true)); + cl::Hidden, cl::init(false)); const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) { const auto IT = AllocaNumbering.find(AI); @@ -236,6 +237,7 @@ void StackColoring::calculateLiveIntervals() { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void StackColoring::dumpAllocas() { dbgs() << "Allocas:\n"; for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) @@ -262,6 +264,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() { dbgs() << " " << AllocaNo << ": " << Range << "\n"; } } +#endif void StackColoring::run() { DEBUG(dumpAllocas()); diff --git a/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp new file mode 100644 index 0000000..07b43a8 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -0,0 +1,656 @@ +//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===// +//=== instrinsics ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass replaces masked memory intrinsics - when unsupported by the target +// - with a chain of basic blocks, that deal with the elements one-by-one if the +// appropriate mask bit is set. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "scalarize-masked-mem-intrin" + +namespace { + +class ScalarizeMaskedMemIntrin : public FunctionPass { + const TargetTransformInfo *TTI; + +public: + static char ID; // Pass identification, replacement for typeid + explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) { + initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { + return "Scalarize Masked Memory Intrinsics"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + +private: + bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT); + bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); +}; +} // namespace + +char ScalarizeMaskedMemIntrin::ID = 0; +INITIALIZE_PASS(ScalarizeMaskedMemIntrin, DEBUG_TYPE, + "Scalarize unsupported masked memory intrinsics", false, false) + +FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() { + return new ScalarizeMaskedMemIntrin(); +} + +// Translate a masked load intrinsic like +// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, +// <16 x i1> %mask, <16 x i32> %passthru) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// %1 = bitcast i8* %addr to i32* +// %2 = extractelement <16 x i1> %mask, i32 0 +// %3 = icmp eq i1 %2, true +// br i1 %3, label %cond.load, label %else +// +// cond.load: ; preds = %0 +// %4 = getelementptr i32* %1, i32 0 +// %5 = load i32* %4 +// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 +// br label %else +// +// else: ; preds = %0, %cond.load +// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] +// %7 = extractelement <16 
x i1> %mask, i32 1 +// %8 = icmp eq i1 %7, true +// br i1 %8, label %cond.load1, label %else2 +// +// cond.load1: ; preds = %else +// %9 = getelementptr i32* %1, i32 1 +// %10 = load i32* %9 +// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 +// br label %else2 +// +// else2: ; preds = %else, %cond.load1 +// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] +// %12 = extractelement <16 x i1> %mask, i32 2 +// %13 = icmp eq i1 %12, true +// br i1 %13, label %cond.load4, label %else5 +// +static void scalarizeMaskedLoad(CallInst *CI) { + Value *Ptr = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); + assert(VecType && "Unexpected return type of masked load intrinsic"); + + Type *EltTy = CI->getType()->getVectorElementType(); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Short-cut if the mask is all-true. + bool IsAllOnesMask = + isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. + AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8); + // Bitcast %addr fron i8* to EltTy* + Type *NewPtrType = + EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); + Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + + if (isa<ConstantVector>(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = + Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] + // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %to_load = icmp eq i1 %mask_1, true + // br i1 %to_load, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = + Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + 
// + CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked store intrinsic, like +// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, +// <16 x i1> %mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set +// +// %1 = bitcast i8* %addr to i32* +// %2 = extractelement <16 x i1> %mask, i32 0 +// %3 = icmp eq i1 %2, true +// br i1 %3, label %cond.store, label %else +// +// cond.store: ; preds = %0 +// %4 = extractelement <16 x i32> %val, i32 0 +// %5 = getelementptr i32* %1, i32 0 +// store i32 %4, i32* %5 +// br label %else +// +// else: ; preds = %0, %cond.store +// %6 = extractelement <16 x i1> %mask, i32 1 +// %7 = icmp eq i1 %6, true +// br i1 %7, label %cond.store1, label %else2 +// +// cond.store1: ; preds = %else +// %8 = extractelement <16 x i32> %val, i32 1 +// %9 = getelementptr i32* %1, i32 1 +// store i32 %8, i32* %9 +// br label %else2 +// . . . +static void scalarizeMaskedStore(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast<VectorType>(Src->getType()); + assert(VecType && "Unexpected data type in masked store intrinsic"); + + Type *EltTy = VecType->getElementType(); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Short-cut if the mask is all-true. + bool IsAllOnesMask = + isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Builder.CreateAlignedStore(Src, Ptr, AlignVal); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. 
+ AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits() / 8); + // Bitcast %addr fron i8* to EltTy* + Type *NewPtrType = + EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); + Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + + if (isa<ConstantVector>(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + } + CI->eraseFromParent(); + return; + } + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %to_store = icmp eq i1 %mask_1, true + // br i1 %to_store, label %cond.store, label %else + // + Value *Predicate = + Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1)); + + // Create "cond" block + // + // %OneElt = extractelement <16 x i32> %Src, i32 Idx + // %EltAddr = getelementptr i32* %1, i32 0 + // %store i32 %OneElt, i32* %EltAddr + // + BasicBlock *CondBlock = + IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +// Translate a masked gather intrinsic like +// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, +// <16 x i1> %Mask, <16 x i32> %Src) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> %Mask, i32 0 +// % ToLoad0 = icmp eq i1 % Mask0, true +// br i1 % ToLoad0, label %cond.load, label %else +// +// cond.load: +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// % Load0 = load i32, i32* % Ptr0, align 4 +// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 +// br label %else +// +// else: +// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] +// % Mask1 = extractelement <16 x i1> %Mask, i32 1 +// % ToLoad1 = icmp eq i1 % Mask1, true +// br i1 % ToLoad1, label %cond.load1, label %else2 +// +// cond.load1: +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// % Load1 = load i32, i32* % Ptr1, align 4 +// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 +// br label %else2 +// . . . 
+// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void scalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement( + VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + 
CI->eraseFromParent(); +} + +// Translate a masked scatter intrinsic, like +// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, +// <16 x i1> %Mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set. +// +// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> % Mask, i32 0 +// % ToStore0 = icmp eq i1 % Mask0, true +// br i1 %ToStore0, label %cond.store, label %else +// +// cond.store: +// % Elt0 = extractelement <16 x i32> %Src, i32 0 +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// store i32 %Elt0, i32* % Ptr0, align 4 +// br label %else +// +// else: +// % Mask1 = extractelement <16 x i1> % Mask, i32 1 +// % ToStore1 = icmp eq i1 % Mask1, true +// br i1 % ToStore1, label %cond.store1, label %else2 +// +// cond.store1: +// % Elt1 = extractelement <16 x i32> %Src, i32 1 +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// store i32 % Elt1, i32* % Ptr1, align 4 +// br label %else2 +// . . . +static void scalarizeMaskedScatter(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptrs = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + assert(isa<VectorType>(Src->getType()) && + "Unexpected data type in masked scatter intrinsic"); + assert(isa<VectorType>(Ptrs->getType()) && + isa<PointerType>(Ptrs->getType()->getVectorElementType()) && + "Vector of pointers is expected in masked scatter intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + unsigned VectorWidth = Src->getType()->getVectorNumElements(); + + // Shorten the way if the mask is a vector of constants. 
+ bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + bool EverMadeChange = false; + + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool MadeChange = true; + while (MadeChange) { + MadeChange = false; + for (Function::iterator I = F.begin(); I != F.end();) { + BasicBlock *BB = &*I++; + bool ModifiedDTOnIteration = false; + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + if (ModifiedDTOnIteration) + break; + } + + EverMadeChange |= MadeChange; + } + + return EverMadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { + bool MadeChange = false; + + BasicBlock::iterator CurInstIterator = BB.begin(); + while (CurInstIterator != BB.end()) { + if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++)) + MadeChange |= optimizeCallInst(CI, ModifiedDT); + if (ModifiedDT) + return true; + } + + return MadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI, + bool &ModifiedDT) { + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); + if (II) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType())) { + scalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { + scalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } + case 
Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + scalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + scalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } + } + } + + return false; +} diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp index 427d952..5e95f76 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -1,4 +1,4 @@ -//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===// +//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,22 +7,32 @@ // //===----------------------------------------------------------------------===// // -// This implements the ScheduleDAG class, which is a base class used by -// scheduling implementation classes. +/// \file Implements the ScheduleDAG class, which is a base class used by +/// scheduling implementation classes. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <climits> +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "pre-RA-sched" @@ -33,58 +43,87 @@ static cl::opt<bool> StressSchedOpt( cl::desc("Stress test instruction scheduling")); #endif -void SchedulingPriorityQueue::anchor() { } +void SchedulingPriorityQueue::anchor() {} ScheduleDAG::ScheduleDAG(MachineFunction &mf) : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()), TRI(mf.getSubtarget().getRegisterInfo()), MF(mf), - MRI(mf.getRegInfo()), EntrySU(), ExitSU() { + MRI(mf.getRegInfo()) { #ifndef NDEBUG StressSched = StressSchedOpt; #endif } -ScheduleDAG::~ScheduleDAG() {} +ScheduleDAG::~ScheduleDAG() = default; -/// Clear the DAG state (e.g. between scheduling regions). void ScheduleDAG::clearDAG() { SUnits.clear(); EntrySU = SUnit(); ExitSU = SUnit(); } -/// getInstrDesc helper to handle SDNodes. const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const { if (!Node || !Node->isMachineOpcode()) return nullptr; return &TII->get(Node->getMachineOpcode()); } -/// addPred - This adds the specified edge as a pred of the current node if -/// not already. It also adds the current node as a successor of the -/// specified node. 
+LLVM_DUMP_METHOD +raw_ostream &SDep::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { + switch (getKind()) { + case Data: OS << "Data"; break; + case Anti: OS << "Anti"; break; + case Output: OS << "Out "; break; + case Order: OS << "Ord "; break; + } + + switch (getKind()) { + case Data: + OS << " Latency=" << getLatency(); + if (TRI && isAssignedRegDep()) + OS << " Reg=" << PrintReg(getReg(), TRI); + break; + case Anti: + case Output: + OS << " Latency=" << getLatency(); + break; + case Order: + OS << " Latency=" << getLatency(); + switch(Contents.OrdKind) { + case Barrier: OS << " Barrier"; break; + case MayAliasMem: + case MustAliasMem: OS << " Memory"; break; + case Artificial: OS << " Artificial"; break; + case Weak: OS << " Weak"; break; + case Cluster: OS << " Cluster"; break; + } + break; + } + + return OS; +} + bool SUnit::addPred(const SDep &D, bool Required) { // If this node already has this dependence, don't add a redundant one. - for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) { + for (SDep &PredDep : Preds) { // Zero-latency weak edges may be added purely for heuristic ordering. Don't // add them if another kind of edge already exists. - if (!Required && I->getSUnit() == D.getSUnit()) + if (!Required && PredDep.getSUnit() == D.getSUnit()) return false; - if (I->overlaps(D)) { - // Extend the latency if needed. Equivalent to removePred(I) + addPred(D). - if (I->getLatency() < D.getLatency()) { - SUnit *PredSU = I->getSUnit(); + if (PredDep.overlaps(D)) { + // Extend the latency if needed. Equivalent to + // removePred(PredDep) + addPred(D). + if (PredDep.getLatency() < D.getLatency()) { + SUnit *PredSU = PredDep.getSUnit(); // Find the corresponding successor in N. - SDep ForwardD = *I; + SDep ForwardD = PredDep; ForwardD.setSUnit(this); - for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(), - EE = PredSU->Succs.end(); II != EE; ++II) { - if (*II == ForwardD) { - II->setLatency(D.getLatency()); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(D.getLatency()); break; } } - I->setLatency(D.getLatency()); + PredDep.setLatency(D.getLatency()); } return false; } @@ -95,8 +134,10 @@ bool SUnit::addPred(const SDep &D, bool Required) { SUnit *N = D.getSUnit(); // Update the bookkeeping. if (D.getKind() == SDep::Data) { - assert(NumPreds < UINT_MAX && "NumPreds will overflow!"); - assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!"); + assert(NumPreds < std::numeric_limits<unsigned>::max() && + "NumPreds will overflow!"); + assert(N->NumSuccs < std::numeric_limits<unsigned>::max() && + "NumSuccs will overflow!"); ++NumPreds; ++N->NumSuccs; } @@ -105,7 +146,8 @@ bool SUnit::addPred(const SDep &D, bool Required) { ++WeakPredsLeft; } else { - assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!"); + assert(NumPredsLeft < std::numeric_limits<unsigned>::max() && + "NumPredsLeft will overflow!"); ++NumPredsLeft; } } @@ -114,7 +156,8 @@ bool SUnit::addPred(const SDep &D, bool Required) { ++N->WeakSuccsLeft; } else { - assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!"); + assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() && + "NumSuccsLeft will overflow!"); ++N->NumSuccsLeft; } } @@ -127,51 +170,46 @@ bool SUnit::addPred(const SDep &D, bool Required) { return true; } -/// removePred - This removes the specified edge as a pred of the current -/// node if it exists. It also removes the current node as a successor of -/// the specified node. 
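
// A minimal usage sketch of the addPred() behavior shown above (PredSU and
// SuccSU are assumed to be valid SUnits in the same DAG): when an overlapping
// edge already exists, addPred() only extends its latency and returns false
// instead of inserting a duplicate edge.

SDep Dep(PredSU, SDep::Data, /*Reg=*/0);
Dep.setLatency(2);
bool Added = SuccSU->addPred(Dep);   // false if an equivalent edge was already present
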
void SUnit::removePred(const SDep &D) { // Find the matching predecessor. - for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) - if (*I == D) { - // Find the corresponding successor in N. - SDep P = D; - P.setSUnit(this); - SUnit *N = D.getSUnit(); - SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P); - assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); - N->Succs.erase(Succ); - Preds.erase(I); - // Update the bookkeeping. - if (P.getKind() == SDep::Data) { - assert(NumPreds > 0 && "NumPreds will underflow!"); - assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); - --NumPreds; - --N->NumSuccs; - } - if (!N->isScheduled) { - if (D.isWeak()) - --WeakPredsLeft; - else { - assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); - --NumPredsLeft; - } - } - if (!isScheduled) { - if (D.isWeak()) - --N->WeakSuccsLeft; - else { - assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); - --N->NumSuccsLeft; - } - } - if (P.getLatency() != 0) { - this->setDepthDirty(); - N->setHeightDirty(); - } - return; + SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D); + if (I == Preds.end()) + return; + // Find the corresponding successor in N. + SDep P = D; + P.setSUnit(this); + SUnit *N = D.getSUnit(); + SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P); + assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); + N->Succs.erase(Succ); + Preds.erase(I); + // Update the bookkeeping. + if (P.getKind() == SDep::Data) { + assert(NumPreds > 0 && "NumPreds will underflow!"); + assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); + --NumPreds; + --N->NumSuccs; + } + if (!N->isScheduled) { + if (D.isWeak()) + --WeakPredsLeft; + else { + assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); + --NumPredsLeft; } + } + if (!isScheduled) { + if (D.isWeak()) + --N->WeakSuccsLeft; + else { + assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); + --N->NumSuccsLeft; + } + } + if (P.getLatency() != 0) { + this->setDepthDirty(); + N->setHeightDirty(); + } } void SUnit::setDepthDirty() { @@ -181,9 +219,8 @@ void SUnit::setDepthDirty() { do { SUnit *SU = WorkList.pop_back_val(); SU->isDepthCurrent = false; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), - E = SU->Succs.end(); I != E; ++I) { - SUnit *SuccSU = I->getSUnit(); + for (SDep &SuccDep : SU->Succs) { + SUnit *SuccSU = SuccDep.getSUnit(); if (SuccSU->isDepthCurrent) WorkList.push_back(SuccSU); } @@ -197,18 +234,14 @@ void SUnit::setHeightDirty() { do { SUnit *SU = WorkList.pop_back_val(); SU->isHeightCurrent = false; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), - E = SU->Preds.end(); I != E; ++I) { - SUnit *PredSU = I->getSUnit(); + for (SDep &PredDep : SU->Preds) { + SUnit *PredSU = PredDep.getSUnit(); if (PredSU->isHeightCurrent) WorkList.push_back(PredSU); } } while (!WorkList.empty()); } -/// setDepthToAtLeast - Update this node's successors to reflect the -/// fact that this node's depth just increased. -/// void SUnit::setDepthToAtLeast(unsigned NewDepth) { if (NewDepth <= getDepth()) return; @@ -217,9 +250,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) { isDepthCurrent = true; } -/// setHeightToAtLeast - Update this node's predecessors to reflect the -/// fact that this node's height just increased. 
-/// void SUnit::setHeightToAtLeast(unsigned NewHeight) { if (NewHeight <= getHeight()) return; @@ -228,8 +258,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) { isHeightCurrent = true; } -/// ComputeDepth - Calculate the maximal path from the node to the exit. -/// +/// Calculates the maximal path from the node to the exit. void SUnit::ComputeDepth() { SmallVector<SUnit*, 8> WorkList; WorkList.push_back(this); @@ -238,12 +267,11 @@ void SUnit::ComputeDepth() { bool Done = true; unsigned MaxPredDepth = 0; - for (SUnit::const_pred_iterator I = Cur->Preds.begin(), - E = Cur->Preds.end(); I != E; ++I) { - SUnit *PredSU = I->getSUnit(); + for (const SDep &PredDep : Cur->Preds) { + SUnit *PredSU = PredDep.getSUnit(); if (PredSU->isDepthCurrent) MaxPredDepth = std::max(MaxPredDepth, - PredSU->Depth + I->getLatency()); + PredSU->Depth + PredDep.getLatency()); else { Done = false; WorkList.push_back(PredSU); @@ -261,8 +289,7 @@ void SUnit::ComputeDepth() { } while (!WorkList.empty()); } -/// ComputeHeight - Calculate the maximal path from the node to the entry. -/// +/// Calculates the maximal path from the node to the entry. void SUnit::ComputeHeight() { SmallVector<SUnit*, 8> WorkList; WorkList.push_back(this); @@ -271,12 +298,11 @@ void SUnit::ComputeHeight() { bool Done = true; unsigned MaxSuccHeight = 0; - for (SUnit::const_succ_iterator I = Cur->Succs.begin(), - E = Cur->Succs.end(); I != E; ++I) { - SUnit *SuccSU = I->getSUnit(); + for (const SDep &SuccDep : Cur->Succs) { + SUnit *SuccSU = SuccDep.getSUnit(); if (SuccSU->isHeightCurrent) MaxSuccHeight = std::max(MaxSuccHeight, - SuccSU->Height + I->getLatency()); + SuccSU->Height + SuccDep.getLatency()); else { Done = false; WorkList.push_back(SuccSU); @@ -310,24 +336,31 @@ void SUnit::biasCriticalPath() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const { - if (this == &DAG->ExitSU) - OS << "ExitSU"; - else if (this == &DAG->EntrySU) +LLVM_DUMP_METHOD +raw_ostream &SUnit::print(raw_ostream &OS, + const SUnit *Entry, const SUnit *Exit) const { + if (this == Entry) OS << "EntrySU"; + else if (this == Exit) + OS << "ExitSU"; else OS << "SU(" << NodeNum << ")"; + return OS; +} + +LLVM_DUMP_METHOD +raw_ostream &SUnit::print(raw_ostream &OS, const ScheduleDAG *G) const { + return print(OS, &G->EntrySU, &G->ExitSU); } -/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or -/// a group of nodes flagged together. 
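
// A small sketch of the reworked print/dump helpers above (the SU and DAG
// variables are assumed to come from a ScheduleDAG that owns the unit):
// print() now returns the stream, so it can be chained with further output.

SU->print(dbgs(), DAG) << " depth=" << SU->getDepth() << '\n';
SU->dumpAll(DAG);   // node plus predecessor/successor lists with latencies
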
+LLVM_DUMP_METHOD void SUnit::dump(const ScheduleDAG *G) const { print(dbgs(), G); dbgs() << ": "; G->dumpNode(this); } -void SUnit::dumpAll(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const { dump(G); dbgs() << " # preds left : " << NumPredsLeft << "\n"; @@ -343,89 +376,62 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { if (Preds.size() != 0) { dbgs() << " Predecessors:\n"; - for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) { - dbgs() << " "; - switch (I->getKind()) { - case SDep::Data: dbgs() << "data "; break; - case SDep::Anti: dbgs() << "anti "; break; - case SDep::Output: dbgs() << "out "; break; - case SDep::Order: dbgs() << "ord "; break; - } - I->getSUnit()->print(dbgs(), G); - if (I->isArtificial()) - dbgs() << " *"; - dbgs() << ": Latency=" << I->getLatency(); - if (I->isAssignedRegDep()) - dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); - dbgs() << "\n"; + for (const SDep &Dep : Preds) { + dbgs() << " "; + Dep.getSUnit()->print(dbgs(), G); dbgs() << ": "; + Dep.print(dbgs(), G->TRI); dbgs() << '\n'; } } if (Succs.size() != 0) { dbgs() << " Successors:\n"; - for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end(); - I != E; ++I) { - dbgs() << " "; - switch (I->getKind()) { - case SDep::Data: dbgs() << "data "; break; - case SDep::Anti: dbgs() << "anti "; break; - case SDep::Output: dbgs() << "out "; break; - case SDep::Order: dbgs() << "ord "; break; - } - I->getSUnit()->print(dbgs(), G); - if (I->isArtificial()) - dbgs() << " *"; - dbgs() << ": Latency=" << I->getLatency(); - if (I->isAssignedRegDep()) - dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); - dbgs() << "\n"; + for (const SDep &Dep : Succs) { + dbgs() << " "; + Dep.getSUnit()->print(dbgs(), G); dbgs() << ": "; + Dep.print(dbgs(), G->TRI); dbgs() << '\n'; } } } #endif #ifndef NDEBUG -/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that -/// their state is consistent. Return the number of scheduled nodes. -/// unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { bool AnyNotSched = false; unsigned DeadNodes = 0; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - if (!SUnits[i].isScheduled) { - if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) { + for (const SUnit &SUnit : SUnits) { + if (!SUnit.isScheduled) { + if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) { ++DeadNodes; continue; } if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has not been scheduled!\n"; AnyNotSched = true; } - if (SUnits[i].isScheduled && - (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) > - unsigned(INT_MAX)) { + if (SUnit.isScheduled && + (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) > + unsigned(std::numeric_limits<int>::max())) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has an unexpected " << (isBottomUp ? "Height" : "Depth") << " value!\n"; AnyNotSched = true; } if (isBottomUp) { - if (SUnits[i].NumSuccsLeft != 0) { + if (SUnit.NumSuccsLeft != 0) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has successors left!\n"; AnyNotSched = true; } } else { - if (SUnits[i].NumPredsLeft != 0) { + if (SUnit.NumPredsLeft != 0) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! 
***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has predecessors left!\n"; AnyNotSched = true; } @@ -436,36 +442,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { } #endif -/// InitDAGTopologicalSorting - create the initial topological -/// ordering from the DAG to be scheduled. -/// -/// The idea of the algorithm is taken from -/// "Online algorithms for managing the topological order of -/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly -/// This is the MNR algorithm, which was first introduced by -/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in -/// "Maintaining a topological order under edge insertions". -/// -/// Short description of the algorithm: -/// -/// Topological ordering, ord, of a DAG maps each node to a topological -/// index so that for all edges X->Y it is the case that ord(X) < ord(Y). -/// -/// This means that if there is a path from the node X to the node Z, -/// then ord(X) < ord(Z). -/// -/// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would -/// create a cycle. -/// -/// The algorithm first computes a topological ordering for the DAG by -/// initializing the Index2Node and Node2Index arrays and then tries to keep -/// the ordering up-to-date after edge insertions by reordering the DAG. -/// -/// On insertion of the edge X->Y, the algorithm first marks by calling DFS -/// the nodes reachable from Y, and then shifts them using Shift to lie -/// immediately after X in Index2Node. void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { + // The idea of the algorithm is taken from + // "Online algorithms for managing the topological order of + // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly + // This is the MNR algorithm, which was first introduced by + // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in + // "Maintaining a topological order under edge insertions". + // + // Short description of the algorithm: + // + // Topological ordering, ord, of a DAG maps each node to a topological + // index so that for all edges X->Y it is the case that ord(X) < ord(Y). + // + // This means that if there is a path from the node X to the node Z, + // then ord(X) < ord(Z). + // + // This property can be used to check for reachability of nodes: + // if Z is reachable from X, then an insertion of the edge Z->X would + // create a cycle. + // + // The algorithm first computes a topological ordering for the DAG by + // initializing the Index2Node and Node2Index arrays and then tries to keep + // the ordering up-to-date after edge insertions by reordering the DAG. + // + // On insertion of the edge X->Y, the algorithm first marks by calling DFS + // the nodes reachable from Y, and then shifts them using Shift to lie + // immediately after X in Index2Node. unsigned DAGSize = SUnits.size(); std::vector<SUnit*> WorkList; WorkList.reserve(DAGSize); @@ -476,18 +479,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { // Initialize the data structures. if (ExitSU) WorkList.push_back(ExitSU); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - int NodeNum = SU->NodeNum; - unsigned Degree = SU->Succs.size(); + for (SUnit &SU : SUnits) { + int NodeNum = SU.NodeNum; + unsigned Degree = SU.Succs.size(); // Temporarily use the Node2Index array as scratch space for degree counts. Node2Index[NodeNum] = Degree; // Is it a node without dependencies? 
if (Degree == 0) { - assert(SU->Succs.empty() && "SUnit should have no successors"); + assert(SU.Succs.empty() && "SUnit should have no successors"); // Collect leaf nodes. - WorkList.push_back(SU); + WorkList.push_back(&SU); } } @@ -497,9 +499,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { WorkList.pop_back(); if (SU->NodeNum < DAGSize) Allocate(SU->NodeNum, --Id); - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - SUnit *SU = I->getSUnit(); + for (const SDep &PredDep : SU->Preds) { + SUnit *SU = PredDep.getSUnit(); if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum]) // If all dependencies of the node are processed already, // then the node can be computed now. @@ -511,19 +512,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { #ifndef NDEBUG // Check correctness of the ordering - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && + for (SUnit &SU : SUnits) { + for (const SDep &PD : SU.Preds) { + assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] && "Wrong topological sorting"); } } #endif } -/// AddPred - Updates the topological ordering to accommodate an edge -/// to be added from SUnit X to SUnit Y. void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { int UpperBound, LowerBound; LowerBound = Node2Index[Y->NodeNum]; @@ -540,16 +537,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { } } -/// RemovePred - Updates the topological ordering to accommodate an -/// an edge to be removed from the specified node N from the predecessors -/// of the current node M. void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) { // InitDAGTopologicalSorting(); } -/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark -/// all nodes affected by the edge insertion. These nodes will later get new -/// topological indexes by means of the Shift method. void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, bool &HasLoop) { std::vector<const SUnit*> WorkList; @@ -560,8 +551,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, SU = WorkList.back(); WorkList.pop_back(); Visited.set(SU->NodeNum); - for (int I = SU->Succs.size()-1; I >= 0; --I) { - unsigned s = SU->Succs[I].getSUnit()->NodeNum; + for (const SDep &SuccDep + : make_range(SU->Succs.rbegin(), SU->Succs.rend())) { + unsigned s = SuccDep.getSUnit()->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). if (s >= Node2Index.size()) continue; @@ -571,14 +563,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, } // Visit successors if not already and in affected region. 
if (!Visited.test(s) && Node2Index[s] < UpperBound) { - WorkList.push_back(SU->Succs[I].getSUnit()); + WorkList.push_back(SuccDep.getSUnit()); + } + } + } while (!WorkList.empty()); +} + +std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, + const SUnit &TargetSU, + bool &Success) { + std::vector<const SUnit*> WorkList; + int LowerBound = Node2Index[StartSU.NodeNum]; + int UpperBound = Node2Index[TargetSU.NodeNum]; + bool Found = false; + BitVector VisitedBack; + std::vector<int> Nodes; + + if (LowerBound > UpperBound) { + Success = false; + return Nodes; + } + + WorkList.reserve(SUnits.size()); + Visited.reset(); + + // Starting from StartSU, visit all successors up + // to UpperBound. + WorkList.push_back(&StartSU); + do { + const SUnit *SU = WorkList.back(); + WorkList.pop_back(); + for (int I = SU->Succs.size()-1; I >= 0; --I) { + const SUnit *Succ = SU->Succs[I].getSUnit(); + unsigned s = Succ->NodeNum; + // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). + if (Succ->isBoundaryNode()) + continue; + if (Node2Index[s] == UpperBound) { + Found = true; + continue; + } + // Visit successors if not already and in affected region. + if (!Visited.test(s) && Node2Index[s] < UpperBound) { + Visited.set(s); + WorkList.push_back(Succ); + } + } + } while (!WorkList.empty()); + + if (!Found) { + Success = false; + return Nodes; + } + + WorkList.clear(); + VisitedBack.resize(SUnits.size()); + Found = false; + + // Starting from TargetSU, visit all predecessors up + // to LowerBound. SUs that are visited by the two + // passes are added to Nodes. + WorkList.push_back(&TargetSU); + do { + const SUnit *SU = WorkList.back(); + WorkList.pop_back(); + for (int I = SU->Preds.size()-1; I >= 0; --I) { + const SUnit *Pred = SU->Preds[I].getSUnit(); + unsigned s = Pred->NodeNum; + // Edges to non-SUnits are allowed but ignored (e.g. EntrySU). + if (Pred->isBoundaryNode()) + continue; + if (Node2Index[s] == LowerBound) { + Found = true; + continue; + } + if (!VisitedBack.test(s) && Visited.test(s)) { + VisitedBack.set(s); + WorkList.push_back(Pred); + Nodes.push_back(s); } } } while (!WorkList.empty()); + + assert(Found && "Error in SUnit Graph!"); + Success = true; + return Nodes; } -/// Shift - Renumber the nodes so that the topological ordering is -/// preserved. void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, int UpperBound) { std::vector<int> L; @@ -598,28 +669,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, } } - for (unsigned j = 0; j < L.size(); ++j) { - Allocate(L[j], i - shift); + for (unsigned LI : L) { + Allocate(LI, i - shift); i = i + 1; } } - -/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will -/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU). bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) { // Is SU reachable from TargetSU via successor edges? if (IsReachable(SU, TargetSU)) return true; - for (SUnit::pred_iterator - I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I) - if (I->isAssignedRegDep() && - IsReachable(SU, I->getSUnit())) + for (const SDep &PredDep : TargetSU->Preds) + if (PredDep.isAssignedRegDep() && + IsReachable(SU, PredDep.getSUnit())) return true; return false; } -/// IsReachable - Checks if SU is reachable from TargetSU. 
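
// A usage sketch for the new GetSubGraph() helper above (the Topo object is
// assumed to be the ScheduleDAGTopologicalSort for the region, StartSU and
// TargetSU valid SUnit pointers): it returns the node numbers lying on paths
// between the two units, with Success reporting whether such a path exists in
// the current topological order.

bool Success = false;
std::vector<int> Between = Topo.GetSubGraph(*StartSU, *TargetSU, Success);
if (Success)
  for (int NodeNum : Between)
    dbgs() << "SU(" << NodeNum << ") lies between the two units\n";
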
bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, const SUnit *TargetSU) { // If insertion of the edge SU->TargetSU would create a cycle @@ -637,7 +703,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, return HasLoop; } -/// Allocate - assign the topological index to the node n. void ScheduleDAGTopologicalSort::Allocate(int n, int index) { Node2Index[n] = index; Index2Node[index] = n; @@ -647,4 +712,4 @@ ScheduleDAGTopologicalSort:: ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu) : SUnits(sunits), ExitSU(exitsu) {} -ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {} +ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default; diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 611c5a7..99baa07 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -7,41 +7,63 @@ // //===----------------------------------------------------------------------===// // -// This implements the ScheduleDAGInstrs class, which implements re-scheduling -// of MachineInstrs. +/// \file This implements the ScheduleDAGInstrs class, which implements +/// re-scheduling of MachineInstrs. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDFS.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Type.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <string> +#include <utility> +#include <vector> using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden, cl::ZeroOrMore, cl::init(false), @@ -90,77 +112,17 @@ 
ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, bool RemoveKillFlags) : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), - RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), - TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr), + RemoveKillFlags(RemoveKillFlags), UnknownValue(UndefValue::get( - Type::getVoidTy(mf.getFunction()->getContext()))), - FirstDbgValue(nullptr) { + Type::getVoidTy(mf.getFunction()->getContext()))) { DbgValues.clear(); const TargetSubtargetInfo &ST = mf.getSubtarget(); SchedModel.init(ST.getSchedModel(), &ST, TII); } -/// getUnderlyingObjectFromInt - This is the function that does the work of -/// looking through basic ptrtoint+arithmetic+inttoptr sequences. -static const Value *getUnderlyingObjectFromInt(const Value *V) { - do { - if (const Operator *U = dyn_cast<Operator>(V)) { - // If we find a ptrtoint, we can transfer control back to the - // regular getUnderlyingObjectFromInt. - if (U->getOpcode() == Instruction::PtrToInt) - return U->getOperand(0); - // If we find an add of a constant, a multiplied value, or a phi, it's - // likely that the other operand will lead us to the base - // object. We don't have to worry about the case where the - // object address is somehow being computed by the multiply, - // because our callers only care when the result is an - // identifiable object. - if (U->getOpcode() != Instruction::Add || - (!isa<ConstantInt>(U->getOperand(1)) && - Operator::getOpcode(U->getOperand(1)) != Instruction::Mul && - !isa<PHINode>(U->getOperand(1)))) - return V; - V = U->getOperand(0); - } else { - return V; - } - assert(V->getType()->isIntegerTy() && "Unexpected operand type!"); - } while (1); -} - -/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects -/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences. -static void getUnderlyingObjects(const Value *V, - SmallVectorImpl<Value *> &Objects, - const DataLayout &DL) { - SmallPtrSet<const Value *, 16> Visited; - SmallVector<const Value *, 4> Working(1, V); - do { - V = Working.pop_back_val(); - - SmallVector<Value *, 4> Objs; - GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL); - - for (Value *V : Objs) { - if (!Visited.insert(V).second) - continue; - if (Operator::getOpcode(V) == Instruction::IntToPtr) { - const Value *O = - getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0)); - if (O->getType()->isPointerTy()) { - Working.push_back(O); - continue; - } - } - Objects.push_back(const_cast<Value *>(V)); - } - } while (!Working.empty()); -} - -/// getUnderlyingObjectsForInstr - If this machine instr has memory reference -/// information and it can be tracked to a normal reference to a known -/// object, return the Value for that object. +/// If this machine instr has memory reference information and it can be tracked +/// to a normal reference to a known object, return the Value for that object. 
static void getUnderlyingObjectsForInstr(const MachineInstr *MI, const MachineFrameInfo &MFI, UnderlyingObjectsVector &Objects, @@ -189,12 +151,10 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI, Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias)); } else if (const Value *V = MMO->getValue()) { SmallVector<Value *, 4> Objs; - getUnderlyingObjects(V, Objs, DL); + getUnderlyingObjectsForCodeGen(V, Objs, DL); for (Value *V : Objs) { - if (!isIdentifiedObject(V)) - return false; - + assert(isIdentifiedObject(V)); Objects.push_back(UnderlyingObjectsVector::value_type(V, true)); } } else @@ -216,10 +176,6 @@ void ScheduleDAGInstrs::finishBlock() { BB = nullptr; } -/// Initialize the DAG and common scheduler state for the current scheduling -/// region. This does not actually create the DAG, only clears it. The -/// scheduling driver may call BuildSchedGraph multiple times per scheduling -/// region. void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, MachineBasicBlock::iterator begin, MachineBasicBlock::iterator end, @@ -230,20 +186,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, NumRegionInstrs = regioninstrs; } -/// Close the current scheduling region. Don't clear any state in case the -/// driver wants to refer to the previous scheduling region. void ScheduleDAGInstrs::exitRegion() { // Nothing to do. } -/// addSchedBarrierDeps - Add dependencies from instructions in the current -/// list of instructions being scheduled to scheduling barrier by adding -/// the exit SU to the register defs and use list. This is because we want to -/// make sure instructions which define registers that are either used by -/// the terminator or are live-out are properly scheduled. This is -/// especially important when the definition latency of the return value(s) -/// are too high to be hidden by the branch or when the liveout registers -/// used by instructions in the fallthrough block. void ScheduleDAGInstrs::addSchedBarrierDeps() { MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr; ExitSU.setInstr(ExitMI); @@ -271,7 +217,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { } } -/// MO is an operand of SU's instruction that defines a physical register. Add +/// MO is an operand of SU's instruction that defines a physical register. Adds /// data dependencies from SU to any uses of the physical register. void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx); @@ -313,9 +259,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { } } -/// addPhysRegDeps - Add register dependencies (data, anti, and output) from -/// this SUnit to following instructions in the same scheduling region that -/// depend the physical register referenced at OperIdx. +/// \brief Adds register dependencies (data, anti, and output) from this SUnit +/// to following instructions in the same scheduling region that depend the +/// physical register referenced at OperIdx. 
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { MachineInstr *MI = SU->getInstr(); MachineOperand &MO = MI->getOperand(OperIdx); @@ -406,9 +352,9 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const return TRI->getSubRegIndexLaneMask(SubReg); } -/// addVRegDefDeps - Add register output and data dependencies from this SUnit -/// to instructions that occur later in the same scheduling region if they read -/// from or write to the virtual register defined at OperIdx. +/// Adds register output and data dependencies from this SUnit to instructions +/// that occur later in the same scheduling region if they read from or write to +/// the virtual register defined at OperIdx. /// /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. @@ -515,10 +461,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } -/// addVRegUseDeps - Add a register data dependency if the instruction that -/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a -/// register antidependency from this SUnit to instructions that occur later in -/// the same scheduling region if they write the virtual register. +/// \brief Adds a register data dependency if the instruction that defines the +/// virtual register used at OperIdx is mapped to an SUnit. Add a register +/// antidependency from this SUnit to instructions that occur later in the same +/// scheduling region if they write the virtual register. /// /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { @@ -545,87 +491,25 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { } } -/// Return true if MI is an instruction we are unable to reason about +/// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) { return MI->isCall() || MI->hasUnmodeledSideEffects() || (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA)); } -/// This returns true if the two MIs need a chain edge between them. -/// This is called on normal stores and loads. -static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI, - const DataLayout &DL, MachineInstr *MIa, - MachineInstr *MIb) { - const MachineFunction *MF = MIa->getParent()->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - - assert ((MIa->mayStore() || MIb->mayStore()) && - "Dependency checked between two loads"); - - // Let the target decide if memory accesses cannot possibly overlap. - if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA)) - return false; - - // To this point analysis is generic. From here on we do need AA. - if (!AA) - return true; - - // FIXME: Need to handle multiple memory operands to support all targets. - if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) - return true; - - MachineMemOperand *MMOa = *MIa->memoperands_begin(); - MachineMemOperand *MMOb = *MIb->memoperands_begin(); - - if (!MMOa->getValue() || !MMOb->getValue()) - return true; - - // The following interface to AA is fashioned after DAGCombiner::isAlias - // and operates with MachineMemOperand offset with some important - // assumptions: - // - LLVM fundamentally assumes flat address spaces. 
- // - MachineOperand offset can *only* result from legalization and - // cannot affect queries other than the trivial case of overlap - // checking. - // - These offsets never wrap and never step outside - // of allocated objects. - // - There should never be any negative offsets here. - // - // FIXME: Modify API to hide this math from "user" - // FIXME: Even before we go to AA we can reason locally about some - // memory objects. It can save compile time, and possibly catch some - // corner cases not currently covered. - - assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); - assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); - - int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); - int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; - int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; - - AliasResult AAResult = - AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, - UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), - MemoryLocation(MMOb->getValue(), Overlapb, - UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); - - return (AAResult != NoAlias); -} - -/// Check whether two objects need a chain edge and add it if needed. void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb, unsigned Latency) { - if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(), - SUb->getInstr())) { + if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) { SDep Dep(SUa, SDep::MayAliasMem); Dep.setLatency(Latency); SUb->addPred(Dep); } } -/// Create an SUnit for each real instruction, numbered in top-down topological -/// order. The instruction order A < B, implies that no edge exists from B to A. +/// \brief Creates an SUnit for each real instruction, numbered in top-down +/// topological order. The instruction order A < B, implies that no edge exists +/// from B to A. /// /// Map each real instruction to its SUnit. /// @@ -640,7 +524,7 @@ void ScheduleDAGInstrs::initSUnits() { // which is contained within a basic block. SUnits.reserve(NumRegionInstrs); - for (MachineInstr &MI : llvm::make_range(RegionBegin, RegionEnd)) { + for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) { if (MI.isDebugValue()) continue; @@ -682,23 +566,22 @@ void ScheduleDAGInstrs::initSUnits() { } class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> { - /// Current total number of SUs in map. - unsigned NumNodes; + unsigned NumNodes = 0; /// 1 for loads, 0 for stores. (see comment in SUList) unsigned TrueMemOrderLatency; -public: - Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {} +public: + Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {} /// To keep NumNodes up to date, insert() is used instead of /// this operator w/ push_back(). ValueType &operator[](const SUList &Key) { llvm_unreachable("Don't use. Use insert() instead."); }; - /// Add SU to the SUList of V. If Map grows huge, reduce its size - /// by calling reduce(). + /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling + /// reduce(). 
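The MachineMemOperand offset handling in the deleted MIsNeedChainEdge above, now delegated to MachineInstr::mayAlias, rebases both offsets to the smaller one and folds them into the location sizes handed to alias analysis, so the AA query can be phrased purely in terms of the underlying IR values. A small numeric sketch of just that arithmetic, using hypothetical Access/AAQuerySizes types:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Both offsets are measured from the same underlying value and assumed
// non-negative, mirroring the asserts in the removed code.
struct Access {
  int64_t Offset; // offset from the underlying value, in bytes
  int64_t Size;   // access size, in bytes
};

struct AAQuerySizes {
  int64_t SizeA;
  int64_t SizeB;
};

AAQuerySizes foldOffsetsIntoSizes(const Access &A, const Access &B) {
  assert(A.Offset >= 0 && B.Offset >= 0 && "negative MachineMemOperand offset");
  int64_t MinOffset = std::min(A.Offset, B.Offset);
  // The offset difference is folded into the size of the farther access.
  return {A.Size + A.Offset - MinOffset, B.Size + B.Offset - MinOffset};
}

// Example: a 4-byte access at offset 8 and a 4-byte access at offset 0
// become AA locations of 12 and 4 bytes from their respective base values;
// the accesses need a chain edge only if those ranges can overlap.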
void inline insert(SUnit *SU, ValueType V) { MapVector::operator[](V).push_back(SU); NumNodes++; @@ -708,7 +591,7 @@ public: void inline clearList(ValueType V) { iterator Itr = find(V); if (Itr != end()) { - assert (NumNodes >= Itr->second.size()); + assert(NumNodes >= Itr->second.size()); NumNodes -= Itr->second.size(); Itr->second.clear(); @@ -723,8 +606,8 @@ public: unsigned inline size() const { return NumNodes; } - /// Count the number of SUs in this map after a reduction. - void reComputeSize(void) { + /// Counts the number of SUs in this map after a reduction. + void reComputeSize() { NumNodes = 0; for (auto &I : *this) NumNodes += I.second.size(); @@ -754,7 +637,7 @@ void ScheduleDAGInstrs::addChainDependencies(SUnit *SU, } void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) { - assert (BarrierChain != nullptr); + assert(BarrierChain != nullptr); for (auto &I : map) { SUList &sus = I.second; @@ -765,7 +648,7 @@ void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) { } void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { - assert (BarrierChain != nullptr); + assert(BarrierChain != nullptr); // Go through all lists of SUs. for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) { @@ -797,9 +680,6 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { map.reComputeSize(); } -/// If RegPressure is non-null, compute register pressure as a side effect. The -/// DAG builder is an efficient place to do it because it already visits -/// operands. void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, PressureDiffs *PDiffs, @@ -1088,10 +968,6 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() { } } -/// Reduce maps in FIFO order, by N SUs. This is better than turning -/// every Nth memory SU into BarrierChain in buildSchedGraph(), since -/// it avoids unnecessary edges between seen SUs above the new -/// BarrierChain, and those below it. void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, Value2SUsMap &loads, unsigned N) { DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; @@ -1113,7 +989,7 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, // The N last elements in NodeNums will be removed, and the SU with // the lowest NodeNum of them will become the new BarrierChain to // let the not yet seen SUs have a dependency to the removed SUs. - assert (N <= NodeNums.size()); + assert(N <= NodeNums.size()); SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)]; if (BarrierChain) { // The aliasing and non-aliasing maps reduce independently of each @@ -1142,183 +1018,77 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, loads.dump()); } -/// \brief Initialize register live-range state for updating kills. -void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { - // Start with no live registers. - LiveRegs.reset(); - - // Examine the live-in regs of all successors. - for (const MachineBasicBlock *Succ : BB->successors()) { - for (const auto &LI : Succ->liveins()) { - // Repeat, for reg and all subregs. - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); - } - } -} - -/// \brief If we change a kill flag on the bundle instruction implicit register -/// operands, then we also need to propagate that to any instructions inside -/// the bundle which had the same kill state. 
-static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, - bool NewKillState, - const TargetRegisterInfo *TRI) { - if (MI->getOpcode() != TargetOpcode::BUNDLE) - return; - - // Walk backwards from the last instruction in the bundle to the first. - // Once we set a kill flag on an instruction, we bail out, as otherwise we - // might set it on too many operands. We will clear as many flags as we - // can though. - MachineBasicBlock::instr_iterator Begin = MI->getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (Begin != End) { - if (NewKillState) { - if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) - return; - } else - (--End)->clearRegisterKills(Reg, TRI); - } -} - -bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) { - // Setting kill flag... - if (!MO.isKill()) { - MO.setIsKill(true); - toggleBundleKillFlag(MI, MO.getReg(), true, TRI); - return false; - } - - // If MO itself is live, clear the kill flag... - if (LiveRegs.test(MO.getReg())) { - MO.setIsKill(false); - toggleBundleKillFlag(MI, MO.getReg(), false, TRI); - return false; - } - - // If any subreg of MO is live, then create an imp-def for that - // subreg and keep MO marked as killed. - MO.setIsKill(false); - toggleBundleKillFlag(MI, MO.getReg(), false, TRI); - bool AllDead = true; - const unsigned SuperReg = MO.getReg(); - MachineInstrBuilder MIB(MF, MI); - for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - MIB.addReg(*SubRegs, RegState::ImplicitDefine); - AllDead = false; - } - } +static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, + MachineInstr &MI, bool addToLiveRegs) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; - if(AllDead) { - MO.setIsKill(true); - toggleBundleKillFlag(MI, MO.getReg(), true, TRI); + // Things that are available after the instruction are killed by it. + bool IsKill = LiveRegs.available(MRI, Reg); + MO.setIsKill(IsKill); + if (addToLiveRegs) + LiveRegs.addReg(Reg); } - return false; } -// FIXME: Reuse the LivePhysRegs utility for this. -void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { - DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n'); - - LiveRegs.resize(TRI->getNumRegs()); - BitVector killedRegs(TRI->getNumRegs()); +void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) { + DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n'); - startBlockForKills(MBB); + LiveRegs.init(*TRI); + LiveRegs.addLiveOuts(MBB); // Examine block from end to start... - unsigned Count = MBB->size(); - for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin(); - I != E; --Count) { - MachineInstr &MI = *--I; + for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { if (MI.isDebugValue()) continue; // Update liveness. Registers that are defed but not used in this // instruction are now dead. Mark register and all subregs as they // are completely defined. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isRegMask()) - LiveRegs.clearBitsNotInMask(MO.getRegMask()); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (Reg == 0) continue; - if (!MO.isDef()) continue; - // Ignore two-addr defs. - if (MI.isRegTiedToUseOperand(i)) continue; - - // Repeat for reg and all subregs. 
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.reset(*SubRegs); - } - - // Examine all used registers and set/clear kill flag. When a - // register is used multiple times we only set the kill flag on - // the first use. Don't set kill flags on undef operands. - killedRegs.reset(); - - // toggleKillFlag can append new operands (implicit defs), so using - // a range-based loop is not safe. The new operands will be appended - // at the end of the operand list and they don't need to be visited, - // so iterating until the currently last operand is ok. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - bool kill = false; - if (!killedRegs.test(Reg)) { - kill = true; - // A register is not killed if any subregs are live... - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - kill = false; - break; - } - } - - // If subreg is not live, then register is killed if it became - // live in this instruction - if (kill) - kill = !LiveRegs.test(Reg); - } - - if (MO.isKill() != kill) { - DEBUG(dbgs() << "Fixing " << MO << " in "); - toggleKillFlag(&MI, MO); - DEBUG(MI.dump()); - DEBUG({ - if (MI.getOpcode() == TargetOpcode::BUNDLE) { - MachineBasicBlock::instr_iterator Begin = MI.getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (++Begin != End) - DEBUG(Begin->dump()); - } - }); + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + const MachineOperand &MO = *O; + if (MO.isReg()) { + if (!MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + LiveRegs.removeReg(Reg); + } else if (MO.isRegMask()) { + LiveRegs.removeRegsInMask(MO); } - - killedRegs.set(Reg); } - // Mark any used register (that is not using undef) and subregs as - // now live... - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); + // If there is a bundle header fix it up first. + if (!MI.isBundled()) { + toggleKills(MRI, LiveRegs, MI, true); + } else { + MachineBasicBlock::instr_iterator First = MI.getIterator(); + if (MI.isBundle()) { + toggleKills(MRI, LiveRegs, MI, false); + ++First; + } + // Some targets make the (questionable) assumtion that the instructions + // inside the bundle are ordered and consequently only the last use of + // a register inside the bundle can kill it. + MachineBasicBlock::instr_iterator I = std::next(First); + while (I->isBundledWithSucc()) + ++I; + do { + if (!I->isDebugValue()) + toggleKills(MRI, LiveRegs, *I, true); + --I; + } while(I != First); } } } void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const { + // Cannot completely remove virtual function even in release mode. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) SU->getInstr()->dump(); #endif @@ -1347,23 +1117,24 @@ std::string ScheduleDAGInstrs::getDAGName() const { //===----------------------------------------------------------------------===// namespace llvm { -/// \brief Internal state used to compute SchedDFSResult. + +/// Internal state used to compute SchedDFSResult. 
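The rewritten fixupKills/toggleKills above walk each block bottom-up with a LivePhysRegs set: definitions end the downward live range, and a use is a kill exactly when its register is not live below the instruction. A toy version of that rule, assuming hypothetical Instr/fixupKills names, plain unsigned register ids, and no sub-registers, reserved registers, or bundles:

#include <cstddef>
#include <set>
#include <vector>

struct Instr {
  std::vector<unsigned> Defs;
  std::vector<unsigned> Uses;
  std::vector<bool> KillFlags; // parallel to Uses
};

void fixupKills(std::vector<Instr> &Block, std::set<unsigned> LiveOut) {
  std::set<unsigned> Live = std::move(LiveOut);
  // Walk bottom-up so Live always describes liveness *below* the instruction.
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    Instr &MI = *It;
    for (unsigned Reg : MI.Defs)
      Live.erase(Reg); // a def ends the downward live range
    MI.KillFlags.assign(MI.Uses.size(), false);
    for (std::size_t I = 0; I < MI.Uses.size(); ++I) {
      unsigned Reg = MI.Uses[I];
      // A use is a kill iff the register is not already live below this point.
      MI.KillFlags[I] = (Live.count(Reg) == 0);
      Live.insert(Reg); // the use makes the register live above
    }
  }
}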
class SchedDFSImpl { SchedDFSResult &R; /// Join DAG nodes into equivalence classes by their subtree. IntEqClasses SubtreeClasses; /// List PredSU, SuccSU pairs that represent data edges between subtrees. - std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs; + std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs; struct RootData { unsigned NodeID; - unsigned ParentNodeID; // Parent node (member of the parent subtree). - unsigned SubInstrCount; // Instr count in this tree only, not children. + unsigned ParentNodeID; ///< Parent node (member of the parent subtree). + unsigned SubInstrCount = 0; ///< Instr count in this tree only, not + /// children. RootData(unsigned id): NodeID(id), - ParentNodeID(SchedDFSResult::InvalidSubtreeID), - SubInstrCount(0) {} + ParentNodeID(SchedDFSResult::InvalidSubtreeID) {} unsigned getSparseSetIndex() const { return NodeID; } }; @@ -1375,7 +1146,7 @@ public: RootSet.setUniverse(R.DFSNodeData.size()); } - /// Return true if this node been visited by the DFS traversal. + /// Returns true if this node been visited by the DFS traversal. /// /// During visitPostorderNode the Node's SubtreeID is assigned to the Node /// ID. Later, SubtreeID is updated but remains valid. @@ -1384,7 +1155,7 @@ public: != SchedDFSResult::InvalidSubtreeID; } - /// Initialize this node's instruction count. We don't need to flag the node + /// Initializes this node's instruction count. We don't need to flag the node /// visited until visitPostorder because the DAG cannot have cycles. void visitPreorder(const SUnit *SU) { R.DFSNodeData[SU->NodeNum].InstrCount = @@ -1433,8 +1204,8 @@ public: RootSet[SU->NodeNum] = RData; } - /// Called once for each tree edge after calling visitPostOrderNode on the - /// predecessor. Increment the parent node's instruction count and + /// \brief Called once for each tree edge after calling visitPostOrderNode on + /// the predecessor. Increment the parent node's instruction count and /// preemptively join this subtree to its parent's if it is small enough. void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) { R.DFSNodeData[Succ->NodeNum].InstrCount @@ -1442,13 +1213,13 @@ public: joinPredSubtree(PredDep, Succ); } - /// Add a connection for cross edges. + /// Adds a connection for cross edges. void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) { ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ)); } - /// Set each node's subtree ID to the representative ID and record connections - /// between trees. + /// Sets each node's subtree ID to the representative ID and record + /// connections between trees. void finalize() { SubtreeClasses.compress(); R.DFSTreeData.resize(SubtreeClasses.getNumClasses()); @@ -1484,8 +1255,8 @@ public: } protected: - /// Join the predecessor subtree with the successor that is its DFS - /// parent. Apply some heuristics before joining. + /// Joins the predecessor subtree with the successor that is its DFS parent. + /// Applies some heuristics before joining. bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ, bool CheckLimit = true) { assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges"); @@ -1531,12 +1302,15 @@ protected: } while (FromTree != SchedDFSResult::InvalidSubtreeID); } }; -} // namespace llvm + +} // end namespace llvm namespace { -/// \brief Manage the stack used by a reverse depth-first search over the DAG. + +/// Manage the stack used by a reverse depth-first search over the DAG. 
class SchedDAGReverseDFS { - std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack; + std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack; + public: bool isComplete() const { return DFSStack.empty(); } @@ -1558,7 +1332,8 @@ public: return getCurr()->Preds.end(); } }; -} // anonymous + +} // end anonymous namespace static bool hasDataSucc(const SUnit *SU) { for (const SDep &SuccDep : SU->Succs) { @@ -1569,7 +1344,7 @@ static bool hasDataSucc(const SUnit *SU) { return false; } -/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first +/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first /// search from this root. void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { if (!IsBottomUp) @@ -1583,7 +1358,7 @@ void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { SchedDAGReverseDFS DFS; Impl.visitPreorder(&SU); DFS.follow(&SU); - for (;;) { + while (true) { // Traverse the leftmost path as far as possible. while (DFS.getPred() != DFS.getPredEnd()) { const SDep &PredDep = *DFS.getPred(); @@ -1626,8 +1401,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) { } } -LLVM_DUMP_METHOD -void ILPValue::print(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const { OS << InstrCount << " / " << Length << " = "; if (!Length) OS << "BADILP"; @@ -1635,8 +1410,7 @@ void ILPValue::print(raw_ostream &OS) const { OS << format("%g", ((double)InstrCount / Length)); } -LLVM_DUMP_METHOD -void ILPValue::dump() const { +LLVM_DUMP_METHOD void ILPValue::dump() const { dbgs() << *this << '\n'; } @@ -1648,4 +1422,6 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { return OS; } -} // namespace llvm +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp index ca2881c..bb6a459 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/IR/Constants.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" diff --git a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 83bc1ba..b3d83d5 100644 --- a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -1,4 +1,4 @@ -//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===// +//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===// // // The LLVM Compiler Infrastructure // @@ -15,11 +15,13 @@ #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include <cassert> using namespace llvm; @@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer( const InstrItineraryData *II, const 
ScheduleDAG *SchedDAG, const char *ParentDebugType) : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II), - DAG(SchedDAG), IssueWidth(0), IssueCount(0) { - + DAG(SchedDAG) { // Determine the maximum depth of any itinerary. This determines the depth of // the scoreboard. We always make the scoreboard at least 1 cycle deep to // avoid dealing with the boundary condition. diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2c7bffe..432c86d 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -33,6 +34,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" @@ -53,10 +55,6 @@ STATISTIC(SlicedLoads, "Number of load sliced"); namespace { static cl::opt<bool> - CombinerAA("combiner-alias-analysis", cl::Hidden, - cl::desc("Enable DAG combiner alias-analysis heuristics")); - - static cl::opt<bool> CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); @@ -117,7 +115,7 @@ namespace { SmallPtrSet<SDNode *, 32> CombinedNodes; // AA - Used for DAG load/store alias analysis. - AliasAnalysis &AA; + AliasAnalysis *AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. @@ -133,6 +131,9 @@ namespace { /// Add to the worklist making sure its instance is at the back (next to be /// processed.) void AddToWorklist(SDNode *N) { + assert(N->getOpcode() != ISD::DELETED_NODE && + "Deleted Node added to Worklist"); + // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. if (N->getOpcode() == ISD::HANDLENODE) @@ -177,6 +178,7 @@ namespace { void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: + unsigned MaximumLegalStoreInBits; /// Check the specified integer node value to see if it can be simplified or /// if things it uses can be simplified by bit propagation. 
@@ -232,11 +234,18 @@ namespace { SDValue visitTokenFactor(SDNode *N); SDValue visitMERGE_VALUES(SDNode *N); SDValue visitADD(SDNode *N); + SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitSUB(SDNode *N); SDValue visitADDC(SDNode *N); + SDValue visitUADDO(SDNode *N); + SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitSUBC(SDNode *N); + SDValue visitUSUBO(SDNode *N); SDValue visitADDE(SDNode *N); + SDValue visitADDCARRY(SDNode *N); + SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); SDValue visitSUBE(SDNode *N); + SDValue visitSUBCARRY(SDNode *N); SDValue visitMUL(SDNode *N); SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); @@ -259,6 +268,7 @@ namespace { SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); SDValue visitRotate(SDNode *N); + SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); SDValue visitBITREVERSE(SDNode *N); SDValue visitCTLZ(SDNode *N); @@ -271,9 +281,11 @@ namespace { SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); SDValue visitSETCCE(SDNode *N); + SDValue visitSETCCCARRY(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); + SDValue visitAssertZext(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); @@ -336,6 +348,7 @@ namespace { SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); SDValue foldSelectOfConstants(SDNode *N); + SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); @@ -344,6 +357,8 @@ namespace { bool NotExtCompare = false); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); + SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, + const SDLoc &DL); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans = true); @@ -361,14 +376,14 @@ namespace { SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildLogBase2(SDValue Op, const SDLoc &DL); - SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags); - SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags); - SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags); - SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Recip); + SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags); + SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); + SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); + SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal); + SDNodeFlags Flags, bool Reciprocal); SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal); + SDNodeFlags Flags, bool Reciprocal); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -377,6 +392,7 @@ namespace { unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); + SDValue MatchLoadCombine(SDNode *N); SDValue ReduceLoadWidth(SDNode *N); 
SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); @@ -384,9 +400,11 @@ namespace { SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue reduceBuildVecToShuffle(SDNode *N); - SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask, - SDValue VecIn1, SDValue VecIn2, - unsigned LeftIdx); + SDValue reduceBuildVecToTrunc(SDNode *N); + SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, + ArrayRef<int> VectorMask, SDValue VecIn1, + SDValue VecIn2, unsigned LeftIdx); + SDValue matchVSelectOpSizesWithSetCC(SDNode *N); SDValue GetDemandedBits(SDValue V, const APInt &Mask); @@ -416,15 +434,12 @@ namespace { /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { - MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): - MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } + MemOpLink(LSBaseSDNode *N, int64_t Offset) + : MemNode(N), OffsetFromBase(Offset) {} // Ptr to the mem node. LSBaseSDNode *MemNode; // Offset from the base ptr. int64_t OffsetFromBase; - // What is the sequence number of this mem node. - // Lowest mem operand in the DAG starts at zero. - unsigned SequenceNum; }; /// This is a helper function for visitMUL to check the profitability @@ -435,12 +450,6 @@ namespace { SDValue &AddNode, SDValue &ConstNode); - /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a - /// constant build_vector of the stored constant values in Stores. - SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL, - ArrayRef<MemOpLink> Stores, - SmallVectorImpl<SDValue> &Chains, - EVT Ty) const; /// This is a helper function for visitAND and visitZERO_EXTEND. Returns /// true if the (and (load x) c) pattern matches an extload. ExtVT returns @@ -451,34 +460,36 @@ namespace { EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, bool &NarrowLoad); + /// Helper function for MergeConsecutiveStores which merges the + /// component store chains. + SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, + unsigned NumStores); + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. - /// \return number of stores that were merged into a merged store (always - /// a prefix of \p StoreNode). - bool MergeStoresOfConstantsOrVecElts( - SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, - bool IsConstantSrc, bool UseVector); + /// \return True if a merged store was created. + bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, + EVT MemVT, unsigned NumStores, + bool IsConstantSrc, bool UseVector, + bool UseTrunc); /// This is a helper function for MergeConsecutiveStores. /// Stores that may be merged are placed in StoreNodes. - /// Loads that may alias with those stores are placed in AliasLoadNodes. - void getStoreMergeAndAliasCandidates( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, - SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes); + void getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl<MemOpLink> &StoreNodes); /// Helper function for MergeConsecutiveStores. Checks if /// Candidate stores have indirect dependency through their /// operands. 
\return True if safe to merge bool checkMergeStoreCandidatesForDependencies( - SmallVectorImpl<MemOpLink> &StoreNodes); + SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores); /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return number of stores that were merged into a merged store (the /// affected nodes are stored as a prefix in \p StoreNodes). - bool MergeConsecutiveStores(StoreSDNode *N, - SmallVectorImpl<MemOpLink> &StoreNodes); + bool MergeConsecutiveStores(StoreSDNode *N); /// \brief Try to transform a truncation where C is a constant: /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) @@ -489,10 +500,17 @@ namespace { SDValue distributeTruncateThroughAnd(SDNode *N); public: - DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) + DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { + OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); + + MaximumLegalStoreInBits = 0; + for (MVT VT : MVT::all_valuetypes()) + if (EVT(VT).isSimple() && VT != MVT::Other && + TLI.isTypeLegal(EVT(VT)) && + VT.getSizeInBits() >= MaximumLegalStoreInBits) + MaximumLegalStoreInBits = VT.getSizeInBits(); } /// Runs the dag combiner on all nodes in the work list @@ -607,10 +625,16 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, switch (Op.getOpcode()) { default: return false; - case ISD::ConstantFP: - // Don't invert constant FP values after legalize. The negated constant - // isn't necessarily legal. - return LegalOperations ? 0 : 1; + case ISD::ConstantFP: { + if (!LegalOperations) + return 1; + + // Don't invert constant FP values after legalization unless the target says + // the negated constant is legal. + EVT VT = Op.getValueType(); + return TLI.isOperationLegal(ISD::ConstantFP, VT) || + TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT); + } case ISD::FADD: // FIXME: determine better conditions for this xform. if (!Options->UnsafeFPMath) return 0; @@ -629,7 +653,8 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options->UnsafeFPMath && !Op.getNode()->getFlags()->hasNoSignedZeros()) + if (!Options->NoSignedZerosFPMath && + !Op.getNode()->getFlags().hasNoSignedZeros()) return 0; // fold (fneg (fsub A, B)) -> (fsub B, A) @@ -667,7 +692,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); - const SDNodeFlags *Flags = Op.getNode()->getFlags(); + const SDNodeFlags Flags = Op.getNode()->getFlags(); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); @@ -950,8 +975,8 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { /// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); - APInt KnownZero, KnownOne; - if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + KnownBits Known; + if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO)) return false; // Revisit the node. 
@@ -1006,13 +1031,13 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { switch (Opc) { default: break; case ISD::AssertSext: - return DAG.getNode(ISD::AssertSext, DL, PVT, - SExtPromoteOperand(Op.getOperand(0), PVT), - Op.getOperand(1)); + if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT)) + return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1)); + break; case ISD::AssertZext: - return DAG.getNode(ISD::AssertZext, DL, PVT, - ZExtPromoteOperand(Op.getOperand(0), PVT), - Op.getOperand(1)); + if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT)) + return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1)); + break; case ISD::Constant: { unsigned ExtOpc = Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; @@ -1079,37 +1104,44 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); + DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); - if (!NN0.getNode()) - return SDValue(); bool Replace1 = false; SDValue N1 = Op.getOperand(1); - SDValue NN1; - if (N0 == N1) - NN1 = NN0; - else { - NN1 = PromoteOperand(N1, PVT, Replace1); - if (!NN1.getNode()) - return SDValue(); - } + SDValue NN1 = PromoteOperand(N1, PVT, Replace1); + SDLoc DL(Op); - AddToWorklist(NN0.getNode()); - if (NN1.getNode()) - AddToWorklist(NN1.getNode()); + SDValue RV = + DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); + + // We are always replacing N0/N1's use in N and only need + // additional replacements if there are additional uses. + Replace0 &= !N0->hasOneUse(); + Replace1 &= (N0 != N1) && !N1->hasOneUse(); + + // Combine Op here so it is preserved past replacements. + CombineTo(Op.getNode(), RV); - if (Replace0) + // If operands have a use ordering, make sure we deal with + // predecessor first.
+ if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { + std::swap(N0, N1); + std::swap(NN0, NN1); + } + + if (Replace0) { + AddToWorklist(NN0.getNode()); ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); - if (Replace1) + } + if (Replace1) { + AddToWorklist(NN1.getNode()); ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); - - DEBUG(dbgs() << "\nPromoting "; - Op.getNode()->dump(&DAG)); - SDLoc DL(Op); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(Opc, DL, PVT, NN0, NN1)); + } + return Op; } return SDValue(); } @@ -1137,26 +1169,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); + DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + bool Replace = false; SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); if (Opc == ISD::SRA) - N0 = SExtPromoteOperand(Op.getOperand(0), PVT); + N0 = SExtPromoteOperand(N0, PVT); else if (Opc == ISD::SRL) - N0 = ZExtPromoteOperand(Op.getOperand(0), PVT); + N0 = ZExtPromoteOperand(N0, PVT); else N0 = PromoteOperand(N0, PVT, Replace); + if (!N0.getNode()) return SDValue(); + SDLoc DL(Op); + SDValue RV = + DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); + AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); - DEBUG(dbgs() << "\nPromoting "; - Op.getNode()->dump(&DAG)); - SDLoc DL(Op); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1))); + // Deal with Op being deleted. + if (Op && Op.getOpcode() != ISD::DELETED_NODE) + return RV; } return SDValue(); } @@ -1361,8 +1399,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); - SDValue OpV = RV; - DAG.ReplaceAllUsesWith(N, &OpV); + DAG.ReplaceAllUsesWith(N, &RV); } // Push the new node and any users onto the worklist @@ -1389,9 +1426,13 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::ADD: return visitADD(N); case ISD::SUB: return visitSUB(N); case ISD::ADDC: return visitADDC(N); + case ISD::UADDO: return visitUADDO(N); case ISD::SUBC: return visitSUBC(N); + case ISD::USUBO: return visitUSUBO(N); case ISD::ADDE: return visitADDE(N); + case ISD::ADDCARRY: return visitADDCARRY(N); case ISD::SUBE: return visitSUBE(N); + case ISD::SUBCARRY: return visitSUBCARRY(N); case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); @@ -1415,6 +1456,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); + case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); case ISD::CTLZ: return visitCTLZ(N); @@ -1427,9 +1469,11 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); case ISD::SETCCE: return visitSETCCE(N); + case ISD::SETCCCARRY: return visitSETCCCARRY(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); + case ISD::AssertZext: return visitAssertZext(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::ZERO_EXTEND_VECTOR_INREG: return 
visitZERO_EXTEND_VECTOR_INREG(N); @@ -1530,7 +1574,7 @@ SDValue DAGCombiner::combine(SDNode *N) { // If N is a commutative binary node, try commuting it to enable more // sdisel CSE. - if (!RV.getNode() && SelectionDAG::isCommutativeBinOp(N->getOpcode()) && + if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) && N->getNumValues() == 1) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -1574,7 +1618,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { } SmallVector<SDNode *, 8> TFs; // List of token factors to visit. - SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. + SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. SmallPtrSet<SDNode*, 16> SeenOps; bool Changed = false; // If we should replace this token factor. @@ -1618,26 +1662,108 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { } } - SDValue Result; + // Remove Nodes that are chained to another node in the list. Do so + // by walking up chains breadth-first stopping when we've seen + // another operand. In general we must climb to the EntryNode, but we can exit + // early if we find all remaining work is associated with just one operand as + // no further pruning is possible. + + // List of nodes to search through and original Ops from which they originate. + SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist; + SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op. + SmallPtrSet<SDNode *, 16> SeenChains; + bool DidPruneOps = false; + + unsigned NumLeftToConsider = 0; + for (const SDValue &Op : Ops) { + Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++)); + OpWorkCount.push_back(1); + } + + auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { + // If this is an Op, we can remove the op from the list. Remark any + // search associated with it as from the current OpNumber. + if (SeenOps.count(Op) != 0) { + Changed = true; + DidPruneOps = true; + unsigned OrigOpNumber = 0; + while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op) + OrigOpNumber++; + assert((OrigOpNumber != Ops.size()) && + "expected to find TokenFactor Operand"); + // Re-mark worklist from OrigOpNumber to OpNumber + for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) { + if (Worklist[i].second == OrigOpNumber) { + Worklist[i].second = OpNumber; + } + } + OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber]; + OpWorkCount[OrigOpNumber] = 0; + NumLeftToConsider--; + } + // Add if it's a new chain + if (SeenChains.insert(Op).second) { + OpWorkCount[OpNumber]++; + Worklist.push_back(std::make_pair(Op, OpNumber)); + } + }; + + for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) { + // We need to consider at least 2 Ops to prune. + if (NumLeftToConsider <= 1) + break; + auto CurNode = Worklist[i].first; + auto CurOpNumber = Worklist[i].second; + assert((OpWorkCount[CurOpNumber] > 0) && + "Node should not appear in worklist"); + switch (CurNode->getOpcode()) { + case ISD::EntryToken: + // Hitting EntryToken is the only way for the search to terminate without + // hitting + // another operand's search. Prevent us from marking this operand + // considered.
+ NumLeftToConsider++; + break; + case ISD::TokenFactor: + for (const SDValue &Op : CurNode->op_values()) + AddToWorklist(i, Op.getNode(), CurOpNumber); + break; + case ISD::CopyFromReg: + case ISD::CopyToReg: + AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber); + break; + default: + if (auto *MemNode = dyn_cast<MemSDNode>(CurNode)) + AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber); + break; + } + OpWorkCount[CurOpNumber]--; + if (OpWorkCount[CurOpNumber] == 0) + NumLeftToConsider--; + } // If we've changed things around then replace token factor. if (Changed) { + SDValue Result; if (Ops.empty()) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); } else { - // New and improved token factor. - Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); + if (DidPruneOps) { + SmallVector<SDValue, 8> PrunedOps; + // + for (const SDValue &Op : Ops) { + if (SeenChains.count(Op.getNode()) == 0) + PrunedOps.push_back(Op); + } + Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps); + } else { + Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); + } } - - // Add users to worklist if AA is enabled, since it may introduce - // a lot of new chained token factors while removing memory deps. - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - return CombineTo(N, Result, UseAA /*add to worklist*/); + return Result; } - - return Result; + return SDValue(); } /// MERGE_VALUES can always be eliminated. @@ -1664,6 +1790,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { return Const != nullptr && !Const->isOpaque() ? Const : nullptr; } +SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { + auto BinOpcode = BO->getOpcode(); + assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB || + BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV || + BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM || + BinOpcode == ISD::UREM || BinOpcode == ISD::AND || + BinOpcode == ISD::OR || BinOpcode == ISD::XOR || + BinOpcode == ISD::SHL || BinOpcode == ISD::SRL || + BinOpcode == ISD::SRA || BinOpcode == ISD::FADD || + BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL || + BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) && + "Unexpected binary operator"); + + // Bail out if any constants are opaque because we can't constant fold those. + SDValue C1 = BO->getOperand(1); + if (!isConstantOrConstantVector(C1, true) && + !isConstantFPBuildVectorOrConstantFP(C1)) + return SDValue(); + + // Don't do this unless the old select is going away. We want to eliminate the + // binary operator, not replace a binop with a select. + // TODO: Handle ISD::SELECT_CC. + SDValue Sel = BO->getOperand(0); + if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) + return SDValue(); + + SDValue CT = Sel.getOperand(1); + if (!isConstantOrConstantVector(CT, true) && + !isConstantFPBuildVectorOrConstantFP(CT)) + return SDValue(); + + SDValue CF = Sel.getOperand(2); + if (!isConstantOrConstantVector(CF, true) && + !isConstantFPBuildVectorOrConstantFP(CF)) + return SDValue(); + + // We have a select-of-constants followed by a binary operator with a + // constant. Eliminate the binop by pulling the constant math into the select. 
+ // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1 + EVT VT = Sel.getValueType(); + SDLoc DL(Sel); + SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1); + assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) || + isConstantFPBuildVectorOrConstantFP(NewCT)) && + "Failed to constant fold a binop with constant operands"); + + SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1); + assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) || + isConstantFPBuildVectorOrConstantFP(NewCF)) && + "Failed to constant fold a binop with constant operands"); + + return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); +} + SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -1702,16 +1882,36 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (isNullConstant(N1)) return N0; - // fold ((c1-A)+c2) -> (c1+c2)-A if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { - if (N0.getOpcode() == ISD::SUB) - if (isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { - return DAG.getNode(ISD::SUB, DL, VT, - DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), - N0.getOperand(1)); + // fold ((c1-A)+c2) -> (c1+c2)-A + if (N0.getOpcode() == ISD::SUB && + isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { + // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic. + return DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), + N0.getOperand(1)); + } + + // add (sext i1 X), 1 -> zext (not i1 X) + // We don't transform this pattern: + // add (zext i1 X), -1 -> sext (not i1 X) + // because most (?) targets generate better code for the zext form. + if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && + isOneConstantOrOneSplatConstant(N1)) { + SDValue X = N0.getOperand(0); + if ((!LegalOperations || + (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && + TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && + X.getScalarValueSizeInBits() == 1) { + SDValue Not = DAG.getNOT(DL, X, X.getValueType()); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); } + } } + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate add if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1)) return RADD; @@ -1771,9 +1971,60 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // fold (a+b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && - VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1)) + DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); + if (SDValue Combined = visitADDLike(N0, N1, N)) + return Combined; + + if (SDValue Combined = visitADDLike(N1, N0, N)) + return Combined; + + return SDValue(); +} + +static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { + bool Masked = false; + + // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. + while (true) { + if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { + V = V.getOperand(0); + continue; + } + + if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { + Masked = true; + V = V.getOperand(0); + continue; + } + + break; + } + + // If this is not a carry, return. + if (V.getResNo() != 1) + return SDValue(); + + if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && + V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) + return SDValue(); + + // If the result is masked, then no matter what kind of bool it is we can + // return. 
If it isn't, then we need to make sure the bool type is either 0 or + // 1 and not other values. + if (Masked || + TLI.getBooleanContents(V.getValueType()) == + TargetLoweringBase::ZeroOrOneBooleanContent) + return V; + + return SDValue(); +} + +SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { + EVT VT = N0.getValueType(); + SDLoc DL(LocReference); + // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) @@ -1781,12 +2032,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) { DAG.getNode(ISD::SHL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1))); - if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0))) - return DAG.getNode(ISD::SUB, DL, VT, N1, - DAG.getNode(ISD::SHL, DL, VT, - N0.getOperand(0).getOperand(1), - N0.getOperand(1))); if (N1.getOpcode() == ISD::AND) { SDValue AndOp0 = N1.getOperand(0); @@ -1797,7 +2042,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // and similar xforms where the inner op is either ~0 or 0. if (NumSignBits == DestBits && isOneConstantOrOneSplatConstant(N1->getOperand(1))) - return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0); + return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0); } // add (sext i1), X -> sub X, (zext i1) @@ -1818,6 +2063,18 @@ SDValue DAGCombiner::visitADD(SDNode *N) { } } + // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) + if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) + return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), + N0, N1.getOperand(0), N1.getOperand(2)); + + // (add X, Carry) -> (addcarry X, 0, Carry) + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) + if (SDValue Carry = getAsCarry(TLI, N1)) + return DAG.getNode(ISD::ADDCARRY, DL, + DAG.getVTList(VT, Carry.getValueType()), N0, + DAG.getConstant(0, DL, VT), Carry); + return SDValue(); } @@ -1825,40 +2082,90 @@ SDValue DAGCombiner::visitADDC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) - return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); if (N0C && !N1C) - return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0); + return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0); // fold (addc x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + DL, MVT::Glue)); + + // If it cannot overflow, transform into an add. + if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); + + return SDValue(); +} + +SDValue DAGCombiner::visitUADDO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // If the flag result is dead, turn this into an ADD. 
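A small editor's illustration of the comment just above, using the GCC/Clang __builtin_add_overflow intrinsic (a toolchain assumption, not something this patch uses): when nothing reads the overflow bit, the operation degenerates to a plain wrapping add.

#include <cassert>
#include <cstdint>

static uint32_t uaddoSumOnly(uint32_t X, uint32_t Y) {
  uint32_t Sum;
  (void)__builtin_add_overflow(X, Y, &Sum); // carry computed, never consumed
  assert(Sum == X + Y);                     // so this is just an ADD
  return Sum;
}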
+ if (!N->hasAnyUseOfValue(1)) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getUNDEF(CarryVT)); + + // canonicalize constant to RHS. + ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N0C && !N1C) + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0); - // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. - APInt LHSZero, LHSOne; - APInt RHSZero, RHSOne; - DAG.computeKnownBits(N0, LHSZero, LHSOne); + // fold (uaddo x, 0) -> x + no carry out + if (isNullConstant(N1)) + return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); - if (LHSZero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSZero, RHSOne); + // If it cannot overflow, transform into an add. + if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getConstant(0, DL, CarryVT)); + + if (SDValue Combined = visitUADDOLike(N0, N1, N)) + return Combined; - // If all possibly-set bits on the LHS are clear on the RHS, return an OR. - // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) - return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + if (SDValue Combined = visitUADDOLike(N1, N0, N)) + return Combined; + + return SDValue(); +} + +SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { + auto VT = N0.getValueType(); + + // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) + // If Y + 1 cannot overflow. + if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { + SDValue Y = N1.getOperand(0); + SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType()); + if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y, + N1.getOperand(2)); } + // (uaddo X, Carry) -> (addcarry X, 0, Carry) + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) + if (SDValue Carry = getAsCarry(TLI, N1)) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, + DAG.getConstant(0, SDLoc(N), VT), Carry); + return SDValue(); } @@ -1881,6 +2188,90 @@ SDValue DAGCombiner::visitADDE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitADDCARRY(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + SDLoc DL(N); + + // canonicalize constant to RHS + ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N0C && !N1C) + return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); + + // fold (addcarry x, y, false) -> (uaddo x, y) + if (isNullConstant(CarryIn)) + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); + + // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. 
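Editor's sketch of the identity named in the comment above, on plain 32-bit values: adding two zeros with a carry-in simply materializes the carry bit, and can never produce a carry-out.

#include <cassert>
#include <cstdint>

static uint32_t addcarryOfZeros(uint32_t CarryIn) {
  uint32_t Bit = CarryIn & 1u;     // the (and (ext/trunc X), 1) part of the fold
  uint32_t Sum = 0u + 0u + Bit;
  assert(Sum == Bit && Sum <= 1u); // result is the carry bit; no carry out
  return Sum;
}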
+ if (isNullConstant(N0) && isNullConstant(N1)) { + EVT VT = N0.getValueType(); + EVT CarryVT = CarryIn.getValueType(); + SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT); + AddToWorklist(CarryExt.getNode()); + return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt, + DAG.getConstant(1, DL, VT)), + DAG.getConstant(0, DL, CarryVT)); + } + + if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) + return Combined; + + if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N)) + return Combined; + + return SDValue(); +} + +SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, + SDNode *N) { + // Iff the flag result is dead: + // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry) + if ((N0.getOpcode() == ISD::ADD || + (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) && + isNullConstant(N1) && !N->hasAnyUseOfValue(1)) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), + N0.getOperand(0), N0.getOperand(1), CarryIn); + + /** + * When one of the addcarry argument is itself a carry, we may be facing + * a diamond carry propagation. In which case we try to transform the DAG + * to ensure linear carry propagation if that is possible. + * + * We are trying to get: + * (addcarry X, 0, (addcarry A, B, Z):Carry) + */ + if (auto Y = getAsCarry(TLI, N1)) { + /** + * (uaddo A, B) + * / \ + * Carry Sum + * | \ + * | (addcarry *, 0, Z) + * | / + * \ Carry + * | / + * (addcarry X, *, *) + */ + if (Y.getOpcode() == ISD::UADDO && + CarryIn.getResNo() == 1 && + CarryIn.getOpcode() == ISD::ADDCARRY && + isNullConstant(CarryIn.getOperand(1)) && + CarryIn.getOperand(0) == Y.getValue(0)) { + auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(), + Y.getOperand(0), Y.getOperand(1), + CarryIn.getOperand(2)); + AddToWorklist(NewY.getNode()); + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, + DAG.getConstant(0, SDLoc(N), N0.getValueType()), + NewY.getValue(1)); + } + } + + return SDValue(); +} + // Since it may not be valid to emit a fold to zero for vector initializers // check if we can before folding. static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT, @@ -1920,6 +2311,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { N1.getNode()); } + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); // fold (sub x, c) -> (add x, -c) @@ -1944,13 +2338,13 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } // 0 - X --> 0 if the sub is NUW. - if (N->getFlags()->hasNoUnsignedWrap()) + if (N->getFlags().hasNoUnsignedWrap()) return N0; - if (DAG.MaskedValueIsZero(N1, ~APInt::getSignBit(BitWidth))) { + if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) { // N1 is either 0 or the minimum signed value. If the sub is NSW, then // N1 must be 0 because negating the minimum signed value is undefined. - if (N->getFlags()->hasNoSignedWrap()) + if (N->getFlags().hasNoSignedWrap()) return N0; // 0 - X --> X if X is 0 or the minimum signed value. @@ -2066,6 +2460,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitUSUBO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // If the flag result is dead, turn this into an SUB. 
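Editor's sketch (not from the patch) of the unsigned-subtract identities the USUBO combines in this hunk rely on, spot-checked on 32-bit values.

#include <cassert>
#include <cstdint>

static void usuboIdentities(uint32_t X) {
  assert(X - X == 0u);          // usubo x, x  -> 0, no borrow
  assert(X - 0u == X);          // usubo x, 0  -> x, no borrow
  assert(UINT32_MAX - X == ~X); // usubo -1, x -> xor x, -1, never borrows
}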
+ if (!N->hasAnyUseOfValue(1)) + return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), + DAG.getUNDEF(CarryVT)); + + // fold (usubo x, x) -> 0 + no borrow + if (N0 == N1) + return CombineTo(N, DAG.getConstant(0, DL, VT), + DAG.getConstant(0, DL, CarryVT)); + + // fold (usubo x, 0) -> x + no borrow + if (isNullConstant(N1)) + return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + + // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow + if (isAllOnesConstant(N0)) + return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), + DAG.getConstant(0, DL, CarryVT)); + + return SDValue(); +} + SDValue DAGCombiner::visitSUBE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2078,6 +2504,18 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + + // fold (subcarry x, y, false) -> (usubo x, y) + if (isNullConstant(CarryIn)) + return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); + + return SDValue(); +} + SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2122,15 +2560,19 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); // fold (mul x, 0) -> 0 - if (N1IsConst && ConstValue1 == 0) + if (N1IsConst && ConstValue1.isNullValue()) return N1; // We require a splat of the entire scalar bit width for non-contiguous // bit patterns. bool IsFullSplat = ConstValue1.getBitWidth() == VT.getScalarSizeInBits(); // fold (mul x, 1) -> x - if (N1IsConst && ConstValue1 == 1 && IsFullSplat) + if (N1IsConst && ConstValue1.isOneValue() && IsFullSplat) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (mul x, -1) -> 0-x if (N1IsConst && ConstValue1.isAllOnesValue()) { SDLoc DL(N); @@ -2297,6 +2739,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) { return combined; } +static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (DAG.isUndef(N->getOpcode(), {N0, N1})) + return DAG.getUNDEF(VT); + + // undef / X -> 0 + // undef % X -> 0 + if (N0.isUndef()) + return DAG.getConstant(0, DL, VT); + + return SDValue(); +} + SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2319,8 +2778,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return N0; // fold (sdiv X, -1) -> 0-X if (N1C && N1C->isAllOnesValue()) - return DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), N0); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); + + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 @@ -2332,9 +2796,8 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { // better results in that case. The target-specific lowering should learn how // to handle exact sdivs efficiently. 
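For readers unfamiliar with the pow2 path discussed above, this editor's sketch shows one standard expansion of a signed divide by a power of two; it is an assumption about what such a lowering typically looks like, not code from this patch, and it relies on arithmetic right shift of negative values.

#include <cassert>

// Valid for 1 <= K <= 30 on a 32-bit int with arithmetic right shift.
static int sdivByPow2(int X, unsigned K) {
  int D = 1 << K;
  int Bias = (X >> 31) & (D - 1); // add D-1 only when X is negative...
  int Q = (X + Bias) >> K;        // ...so the shift rounds toward zero, like sdiv
  assert(Q == X / D);
  return Q;
}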
if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && - !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() && - (N1C->getAPIntValue().isPowerOf2() || - (-N1C->getAPIntValue()).isPowerOf2())) { + !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() || + (-N1C->getAPIntValue()).isPowerOf2())) { // Target-specific implementation of sdiv x, pow2. if (SDValue Res = BuildSDIVPow2(N)) return Res; @@ -2372,7 +2835,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { // If integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. Targets may check function attributes for size/speed // trade-offs. - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; @@ -2384,13 +2847,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem; - // undef / X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X / undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2414,6 +2870,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { N0C, N1C)) return Folded; + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (udiv x, (1 << c)) -> x >>u c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1)) { @@ -2444,7 +2906,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } // fold (udiv x, c) -> alternate - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; @@ -2456,13 +2918,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem; - // undef / X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X / undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2482,32 +2937,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (isSigned) { // If we know the sign bits of both operands are zero, strength reduce to a // urem instead. 
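Editor's sketch (not from the patch) of the strength reductions referenced here: an unsigned remainder by a power of two is a mask, and a signed remainder whose operands have clear sign bits behaves the same way.

#include <cassert>
#include <cstdint>

static void remStrengthReduction(uint32_t X) {
  assert(X % 16u == (X & 15u));   // urem x, pow2 -> and x, pow2-1
  int S = (int)(X & 0x0FFFFFFFu); // sign bit known zero
  assert(S % 16 == (S & 15));     // (X & 0x0FFFFFFF) %s 16 -> X & 15
}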
Handles (X & 0x0FFFFFFF) %s 16 -> X&15 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } else { - // fold (urem x, pow2) -> (and x, pow2-1) + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (DAG.isKnownToBeAPowerOfTwo(N1)) { - APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); + // fold (urem x, pow2) -> (and x, pow2-1) + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } - // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) if (N1.getOpcode() == ISD::SHL && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { - APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); + // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } } - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. @@ -2536,13 +2994,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem.getValue(1); - // undef % X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X % undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2932,95 +3383,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { return SDValue(); } +/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. +SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, + const SDLoc &DL) { + SDValue LL, LR, RL, RR, N0CC, N1CC; + if (!isSetCCEquivalent(N0, LL, LR, N0CC) || + !isSetCCEquivalent(N1, RL, RR, N1CC)) + return SDValue(); + + assert(N0.getValueType() == N1.getValueType() && + "Unexpected operand types for bitwise logic op"); + assert(LL.getValueType() == LR.getValueType() && + RL.getValueType() == RR.getValueType() && + "Unexpected operand types for setcc"); + + // If we're here post-legalization or the logic op type is not i1, the logic + // op type must match a setcc result type. Also, all folds require new + // operations on the left and right operands, so those types must match. + EVT VT = N0.getValueType(); + EVT OpVT = LL.getValueType(); + if (LegalOperations || VT != MVT::i1) + if (VT != getSetCCResultType(OpVT)) + return SDValue(); + if (OpVT != RL.getValueType()) + return SDValue(); + + ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get(); + ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get(); + bool IsInteger = OpVT.isInteger(); + if (LR == RR && CC0 == CC1 && IsInteger) { + bool IsZero = isNullConstantOrNullSplatConstant(LR); + bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); + + // All bits clear? + bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; + // All sign bits clear? + bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; + // Any bits set? + bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; + // Any sign bits set? 
+ bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; + + // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0) + // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) + // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0) + // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0) + if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { + SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); + AddToWorklist(Or.getNode()); + return DAG.getSetCC(DL, VT, Or, LR, CC1); + } + + // All bits set? + bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; + // All sign bits set? + bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; + // Any bits clear? + bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; + // Any sign bits clear? + bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; + + // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) + // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0) + // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) + // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1) + if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { + SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); + AddToWorklist(And.getNode()); + return DAG.getSetCC(DL, VT, And, LR, CC1); + } + } + + // TODO: What is the 'or' equivalent of this fold? + // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) + if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE && + ((isNullConstant(LR) && isAllOnesConstant(RR)) || + (isAllOnesConstant(LR) && isNullConstant(RR)))) { + SDValue One = DAG.getConstant(1, DL, OpVT); + SDValue Two = DAG.getConstant(2, DL, OpVT); + SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); + AddToWorklist(Add.getNode()); + return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); + } + + // Try more general transforms if the predicates match and the only user of + // the compares is the 'and' or 'or'. + if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && + N0.hasOneUse() && N1.hasOneUse()) { + // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 + // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 + if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { + SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); + SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); + SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); + SDValue Zero = DAG.getConstant(0, DL, OpVT); + return DAG.getSetCC(DL, VT, Or, Zero, CC1); + } + } + + // Canonicalize equivalent operands to LL == RL. + if (LL == RR && LR == RL) { + CC1 = ISD::getSetCCSwappedOperands(CC1); + std::swap(RL, RR); + } + + // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) + // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) + if (LL == RL && LR == RR) { + ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger) + : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); + if (NewCC != ISD::SETCC_INVALID && + (!LegalOperations || + (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && + TLI.isOperationLegal(ISD::SETCC, OpVT)))) + return DAG.getSetCC(DL, VT, LL, LR, NewCC); + } + + return SDValue(); +} + /// This contains all DAGCombine rules which reduce two values combined by /// an And operation to a single value. This makes them reusable in the context /// of visitSELECT(). 
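The table of folds above is dense; as an editor's aside (not code from the patch), here are the underlying integer identities spot-checked in plain C++.

#include <cassert>
#include <cstdint>

static void setccLogicIdentities(int32_t X, int32_t Y) {
  // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
  assert(((X == 0) && (Y == 0)) == ((X | Y) == 0));
  // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
  assert(((X > -1) && (Y > -1)) == ((X | Y) > -1));
  // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
  assert(((X < 0) || (Y < 0)) == ((X | Y) < 0));
  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
  assert(((X != 0) && (X != -1)) == ((uint32_t)X + 1u >= 2u));
}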
Rules involving constants are not included as /// visitSELECT() already handles those cases. -SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, - SDNode *LocReference) { +SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); + SDLoc DL(N); // fold (and x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) - return DAG.getConstant(0, SDLoc(LocReference), VT); - // fold (and (setcc x), (setcc y)) -> (setcc (and x, y)) - SDValue LL, LR, RL, RR, CC0, CC1; - if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ - ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); - ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - - if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 && - LL.getValueType().isInteger()) { - // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0) - if (isNullConstant(LR) && Op1 == ISD::SETEQ) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - if (isAllOnesConstant(LR)) { - // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1) - if (Op1 == ISD::SETEQ) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ANDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); - } - } - // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1) - if (Op1 == ISD::SETGT) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - } - } - // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2) - if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) && - Op0 == Op1 && LL.getValueType().isInteger() && - Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) || - (isAllOnesConstant(LR) && isNullConstant(RR)))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDLoc DL(N0); - SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(), - LL, DAG.getConstant(1, DL, - LL.getValueType())); - AddToWorklist(ADDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode, - DAG.getConstant(2, DL, LL.getValueType()), - ISD::SETUGE); - } - } - // canonicalize equivalent to ll == rl - if (LL == RR && LR == RL) { - Op1 = ISD::getSetCCSwappedOperands(Op1); - std::swap(RL, RR); - } - if (LL == RL && LR == RR) { - bool isInteger = LL.getValueType().isInteger(); - ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger); - if (Result != ISD::SETCC_INVALID && - (!LegalOperations || - (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (N0.getValueType() == CCVT || - (!LegalOperations && N0.getValueType() == MVT::i1)) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); - } - } - } + return DAG.getConstant(0, DL, VT); + + if (SDValue V = 
foldLogicOfSetCCs(true, N0, N1, DL)) + return V; if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && VT.getSizeInBits() <= 64) { @@ -3037,13 +3532,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { ADDC |= Mask; if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { - SDLoc DL(N0); + SDLoc DL0(N0); SDValue NewAdd = - DAG.getNode(ISD::ADD, DL, VT, + DAG.getNode(ISD::ADD, DL0, VT, N0.getOperand(0), DAG.getConstant(ADDC, DL, VT)); CombineTo(N0.getNode(), NewAdd); // Return N so it doesn't get rechecked! - return SDValue(LocReference, 0); + return SDValue(N, 0); } } } @@ -3068,7 +3563,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, unsigned MaskBits = AndMask.countTrailingOnes(); EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); - if (APIntOps::isMask(AndMask) && + if (AndMask.isMask() && // Required bits must not span the two halves of the integer and // must fit in the half size type. (ShiftBits + MaskBits <= Size / 2) && @@ -3108,7 +3603,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, bool &NarrowLoad) { uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); - if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue())) + if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits)) return false; ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); @@ -3191,13 +3686,17 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate and if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; // fold (and (or x, C), D) -> D if (C & D) == D if (N1C && N0.getOpcode() == ISD::OR) if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1))) - if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue()) + if (N1C->getAPIntValue().isSubsetOf(ORI->getAPIntValue())) return N1; // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { @@ -3299,6 +3798,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to // preserve semantics once we get rid of the AND. SDValue NewLoad(Load, 0); + + // Fold the AND away. NewLoad may get replaced immediately. + CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); + if (Load->getExtensionType() == ISD::EXTLOAD) { NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0), SDLoc(Load), @@ -3316,10 +3819,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - // Fold the AND away, taking care not to fold to the old load node if we - // replaced it. - CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); - return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -3398,9 +3897,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Note: the SimplifyDemandedBits fold below can make an information-losing // transform, and then we have no way to find this better fold. 
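Editor's sketch (not from the patch) of the better fold the note above is protecting: negating a zero- or sign-extended i1 and masking with 1 just reproduces the zero-extended bool.

#include <cassert>
#include <cstdint>

static void andOfNegatedBool(bool B) {
  uint32_t Z = B ? 1u : 0u;     // zext i1 B
  uint32_t S = B ? ~0u : 0u;    // sext i1 B
  assert(((0u - Z) & 1u) == Z); // and (sub 0, zext B), 1 -> zext B
  assert(((0u - S) & 1u) == Z); // and (sub 0, sext B), 1 -> zext B
}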
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { - ConstantSDNode *SubLHS = isConstOrConstSplat(N0.getOperand(0)); - SDValue SubRHS = N0.getOperand(1); - if (SubLHS && SubLHS->isNullValue()) { + if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { + SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return SubRHS; @@ -3412,7 +3910,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. - if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (zext_inreg (extload x)) -> (zextload x) @@ -3473,7 +3971,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) return SDValue(); - if (!TLI.isOperationLegal(ISD::BSWAP, VT)) + if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00) @@ -3585,27 +4083,36 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) return false; - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N.getOperand(1)); + SDValue N0 = N.getOperand(0); + unsigned Opc0 = N0.getOpcode(); + if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL) + return false; + + ConstantSDNode *N1C = nullptr; + // SHL or SRL: look upstream for AND mask operand + if (Opc == ISD::AND) + N1C = dyn_cast<ConstantSDNode>(N.getOperand(1)); + else if (Opc0 == ISD::AND) + N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N1C) return false; - unsigned Num; + unsigned MaskByteOffset; switch (N1C->getZExtValue()) { default: return false; - case 0xFF: Num = 0; break; - case 0xFF00: Num = 1; break; - case 0xFF0000: Num = 2; break; - case 0xFF000000: Num = 3; break; + case 0xFF: MaskByteOffset = 0; break; + case 0xFF00: MaskByteOffset = 1; break; + case 0xFF0000: MaskByteOffset = 2; break; + case 0xFF000000: MaskByteOffset = 3; break; } // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). 
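Editor's aside (not from the patch): the value shape these mask-and-shift cases are recognizing. Swapping the bytes inside each 16-bit half of a 32-bit word equals a full byte swap followed by a 16-bit rotate, presumably the cheaper form this combine is after; __builtin_bswap32 is a GCC/Clang builtin used here only for the check.

#include <cassert>
#include <cstdint>

static uint32_t bswapHWord(uint32_t X) {
  uint32_t Swapped = ((X & 0x00ff00ffu) << 8) | ((X >> 8) & 0x00ff00ffu);
  uint32_t Rotated = (__builtin_bswap32(X) << 16) | (__builtin_bswap32(X) >> 16);
  assert(Swapped == Rotated);
  return Swapped;
}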
- SDValue N0 = N.getOperand(0); if (Opc == ISD::AND) { - if (Num == 0 || Num == 2) { + if (MaskByteOffset == 0 || MaskByteOffset == 2) { // (x >> 8) & 0xff // (x >> 8) & 0xff0000 - if (N0.getOpcode() != ISD::SRL) + if (Opc0 != ISD::SRL) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3613,7 +4120,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { } else { // (x << 8) & 0xff00 // (x << 8) & 0xff000000 - if (N0.getOpcode() != ISD::SHL) + if (Opc0 != ISD::SHL) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3622,7 +4129,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { } else if (Opc == ISD::SHL) { // (x & 0xff) << 8 // (x & 0xff0000) << 8 - if (Num != 0 && Num != 2) + if (MaskByteOffset != 0 && MaskByteOffset != 2) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3630,17 +4137,17 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { } else { // Opc == ISD::SRL // (x & 0xff00) >> 8 // (x & 0xff000000) >> 8 - if (Num != 1 && Num != 3) + if (MaskByteOffset != 1 && MaskByteOffset != 3) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } - if (Parts[Num]) + if (Parts[MaskByteOffset]) return false; - Parts[Num] = N0.getOperand(0).getNode(); + Parts[MaskByteOffset] = N0.getOperand(0).getNode(); return true; } @@ -3657,7 +4164,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { EVT VT = N->getValueType(0); if (VT != MVT::i32) return SDValue(); - if (!TLI.isOperationLegal(ISD::BSWAP, VT)) + if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Look for either @@ -3672,18 +4179,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (N1.getOpcode() == ISD::OR && N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { // (or (or (and), (and)), (or (and), (and))) - SDValue N000 = N00.getOperand(0); - if (!isBSwapHWordElement(N000, Parts)) + if (!isBSwapHWordElement(N00, Parts)) return SDValue(); - SDValue N001 = N00.getOperand(1); - if (!isBSwapHWordElement(N001, Parts)) + if (!isBSwapHWordElement(N01, Parts)) return SDValue(); - SDValue N010 = N01.getOperand(0); - if (!isBSwapHWordElement(N010, Parts)) + SDValue N10 = N1.getOperand(0); + if (!isBSwapHWordElement(N10, Parts)) return SDValue(); - SDValue N011 = N01.getOperand(1); - if (!isBSwapHWordElement(N011, Parts)) + SDValue N11 = N1.getOperand(1); + if (!isBSwapHWordElement(N11, Parts)) return SDValue(); } else { // (or (or (or (and), (and)), (and)), (and)) @@ -3723,65 +4228,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { /// This contains all DAGCombine rules which reduce two values combined by /// an Or operation to a single value \see visitANDLike(). -SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { +SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); + SDLoc DL(N); + // fold (or x, undef) -> -1 - if (!LegalOperations && - (N0.isUndef() || N1.isUndef())) { - EVT EltVT = VT.isVector() ? 
VT.getVectorElementType() : VT; - return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), - SDLoc(LocReference), VT); - } - // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) - SDValue LL, LR, RL, RR, CC0, CC1; - if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ - ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); - ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - - if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) { - // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0) - // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0) - if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1) - // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1) - if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR), - LR.getValueType(), LL, RL); - AddToWorklist(ANDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); - } - } - } - // canonicalize equivalent to ll == rl - if (LL == RR && LR == RL) { - Op1 = ISD::getSetCCSwappedOperands(Op1); - std::swap(RL, RR); - } - if (LL == RL && LR == RR) { - bool isInteger = LL.getValueType().isInteger(); - ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger); - if (Result != ISD::SETCC_INVALID && - (!LegalOperations || - (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (N0.getValueType() == CCVT || - (!LegalOperations && N0.getValueType() == MVT::i1)) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); - } - } - } + if (!LegalOperations && (N0.isUndef() || N1.isUndef())) + return DAG.getAllOnesConstant(DL, VT); + + if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) + return V; // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. 
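Editor's sketch (not from the patch) of the identity behind this fold. The MaskedValueIsZero side conditions say each AND input may not have set bits that fall in the other mask but outside its own; the inputs below are pre-masked to satisfy that for two fixed example masks.

#include <cassert>
#include <cstdint>

static void orOfMaskedAnds(uint32_t X, uint32_t Y) {
  const uint32_t C1 = 0x00ff00ffu, C2 = 0xffff0000u;
  X &= ~(C2 & ~C1); // MaskedValueIsZero(X, C2 & ~C1)
  Y &= ~(C1 & ~C2); // MaskedValueIsZero(Y, C1 & ~C2)
  assert(((X & C1) | (Y & C2)) == ((X | Y) & (C1 | C2)));
}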
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && @@ -3802,7 +4258,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1.getOperand(0)); - SDLoc DL(LocReference); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(LHSMask | RHSMask, DL, VT)); } @@ -3818,7 +4273,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); - return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X); + return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); } return SDValue(); @@ -3847,14 +4302,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) // do not return N0, because undef node may exist in N0 - return DAG.getConstant( - APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N), - N0.getValueType()); + return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 - return DAG.getConstant( - APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N), - N1.getValueType()); + return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) // Do this only if the resulting shuffle is legal. @@ -3867,7 +4318,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode()); // Ensure both shuffles have a zero input. - if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) { + if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0); @@ -3939,6 +4390,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1 if (isAllOnesConstant(N1)) return N1; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (or x, c) -> c iff (x & ~c) == 0 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; @@ -3955,20 +4410,22 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // reassociate or if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1)) return ROR; + // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) - // iff (c1 & c2) == 0. - if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && - isa<ConstantSDNode>(N0.getOperand(1))) { - ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1)); - if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) { - if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, - N1C, C1)) - return DAG.getNode( - ISD::AND, SDLoc(N), VT, - DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); - return SDValue(); + // iff (c1 & c2) != 0. 
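Editor's note (not from the patch): the canonicalization above rests on an identity that holds for arbitrary constants; the (c1 & c2) != 0 test only decides when the rewrite is attempted.

#include <cassert>
#include <cstdint>

static void orAndCanonicalize(uint32_t X, uint32_t C1, uint32_t C2) {
  // (or (and X, c1), c2) == (and (or X, c2), c1|c2)
  assert(((X & C1) | C2) == ((X | C2) & (C1 | C2)));
}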
+ if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) { + if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) { + if (SDValue COR = + DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1)) + return DAG.getNode( + ISD::AND, SDLoc(N), VT, + DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); + return SDValue(); + } } } + // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) @@ -3978,9 +4435,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); + if (SDValue Load = MatchLoadCombine(N)) + return Load; + // Simplify the operands using demanded-bits information. - if (!VT.isVector() && - SimplifyDemandedBits(SDValue(N, 0))) + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); @@ -4134,6 +4593,20 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, return nullptr; } +// if Left + Right == Sum (constant or constant splat vector) +static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum, + SelectionDAG &DAG, const SDLoc &DL) { + EVT ShiftVT = Left.getValueType(); + if (ShiftVT != Right.getValueType()) return false; + + SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT, + Left.getNode(), Right.getNode()); + if (!ShiftSum) return false; + + ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum); + return CSum && CSum->getZExtValue() == Sum; +} + // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. @@ -4179,31 +4652,24 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) { - uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue(); - uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue(); - if ((LShVal + RShVal) != EltSizeInBits) - return nullptr; - + if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. 
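Editor's sketch (not from the patch) of the shl/srl pair MatchRotate turns into a rotate, checked against a bit-by-bit reference for 32-bit values and 0 < C < 32.

#include <cassert>
#include <cstdint>

static uint32_t rotlRef(uint32_t X, unsigned C) {
  uint32_t R = 0;
  for (unsigned I = 0; I < 32; ++I)
    if (X & (1u << I))
      R |= 1u << ((I + C) % 32);
  return R;
}

static uint32_t rotlFromShifts(uint32_t X, unsigned C) {
  assert(C > 0 && C < 32);
  uint32_t Pattern = (X << C) | (X >> (32 - C)); // (or (shl x, C1), (srl x, C2)) with C1 + C2 == 32
  assert(Pattern == rotlRef(X, C));              // ...is exactly rotl x, C1
  return Pattern;
}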
if (LHSMask.getNode() || RHSMask.getNode()) { - APInt AllBits = APInt::getAllOnesValue(EltSizeInBits); - SDValue Mask = DAG.getConstant(AllBits, DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + SDValue Mask = AllOnes; if (LHSMask.getNode()) { - APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); + SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, LHSMask, - DAG.getConstant(RHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); } if (RHSMask.getNode()) { - APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal); + SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, RHSMask, - DAG.getConstant(LHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); @@ -4246,109 +4712,299 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { } namespace { -/// Helper struct to parse and store a memory address as base + index + offset. -/// We ignore sign extensions when it is safe to do so. -/// The following two expressions are not equivalent. To differentiate we need -/// to store whether there was a sign extension involved in the index -/// computation. -/// (load (i64 add (i64 copyfromreg %c) -/// (i64 signextend (add (i8 load %index) -/// (i8 1)))) -/// vs -/// -/// (load (i64 add (i64 copyfromreg %c) -/// (i64 signextend (i32 add (i32 signextend (i8 load %index)) -/// (i32 1))))) -struct BaseIndexOffset { - SDValue Base; - SDValue Index; - int64_t Offset; - bool IsIndexSignExt; - - BaseIndexOffset() : Offset(0), IsIndexSignExt(false) {} - - BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset, - bool IsIndexSignExt) : - Base(Base), Index(Index), Offset(Offset), IsIndexSignExt(IsIndexSignExt) {} - - bool equalBaseIndex(const BaseIndexOffset &Other) { - return Other.Base == Base && Other.Index == Index && - Other.IsIndexSignExt == IsIndexSignExt; - } - - /// Parses tree in Ptr for base, index, offset addresses. - static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG, - int64_t PartialOffset = 0) { - bool IsIndexSignExt = false; - - // Split up a folded GlobalAddress+Offset into its component parts. - if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) - if (GA->getOpcode() == ISD::GlobalAddress && GA->getOffset() != 0) { - return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(), - SDLoc(GA), - GA->getValueType(0), - /*Offset=*/PartialOffset, - /*isTargetGA=*/false, - GA->getTargetFlags()), - SDValue(), - GA->getOffset(), - IsIndexSignExt); - } - - // We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD - // instruction, then it could be just the BASE or everything else we don't - // know how to handle. Just use Ptr as BASE and give up. - if (Ptr->getOpcode() != ISD::ADD) - return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt); - - // We know that we have at least an ADD instruction. Try to pattern match - // the simple case of BASE + OFFSET. - if (isa<ConstantSDNode>(Ptr->getOperand(1))) { - int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue(); - return match(Ptr->getOperand(0), DAG, Offset + PartialOffset); - } - - // Inside a loop the current BASE pointer is calculated using an ADD and a - // MUL instruction. In this case Ptr is the actual BASE pointer. 
- // (i64 add (i64 %array_ptr) - // (i64 mul (i64 %induction_var) - // (i64 %element_size))) - if (Ptr->getOperand(1)->getOpcode() == ISD::MUL) - return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt); - - // Look at Base + Index + Offset cases. - SDValue Base = Ptr->getOperand(0); - SDValue IndexOffset = Ptr->getOperand(1); - - // Skip signextends. - if (IndexOffset->getOpcode() == ISD::SIGN_EXTEND) { - IndexOffset = IndexOffset->getOperand(0); - IsIndexSignExt = true; - } - - // Either the case of Base + Index (no offset) or something else. - if (IndexOffset->getOpcode() != ISD::ADD) - return BaseIndexOffset(Base, IndexOffset, PartialOffset, IsIndexSignExt); - - // Now we have the case of Base + Index + offset. - SDValue Index = IndexOffset->getOperand(0); - SDValue Offset = IndexOffset->getOperand(1); - - if (!isa<ConstantSDNode>(Offset)) - return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt); - - // Ignore signextends. - if (Index->getOpcode() == ISD::SIGN_EXTEND) { - Index = Index->getOperand(0); - IsIndexSignExt = true; - } else IsIndexSignExt = false; - - int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue(); - return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt); +/// Represents known origin of an individual byte in load combine pattern. The +/// value of the byte is either constant zero or comes from memory. +struct ByteProvider { + // For constant zero providers Load is set to nullptr. For memory providers + // Load represents the node which loads the byte from memory. + // ByteOffset is the offset of the byte in the value produced by the load. + LoadSDNode *Load; + unsigned ByteOffset; + + ByteProvider() : Load(nullptr), ByteOffset(0) {} + + static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { + return ByteProvider(Load, ByteOffset); } + static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } + + bool isConstantZero() const { return !Load; } + bool isMemory() const { return Load; } + + bool operator==(const ByteProvider &Other) const { + return Other.Load == Load && Other.ByteOffset == ByteOffset; + } + +private: + ByteProvider(LoadSDNode *Load, unsigned ByteOffset) + : Load(Load), ByteOffset(ByteOffset) {} }; + +/// Recursively traverses the expression calculating the origin of the requested +/// byte of the given value. Returns None if the provider can't be calculated. +/// +/// For all the values except the root of the expression verifies that the value +/// has exactly one use and if it's not true return None. This way if the origin +/// of the byte is returned it's guaranteed that the values which contribute to +/// the byte are not used outside of this expression. +/// +/// Because the parts of the expression are not allowed to have more than one +/// use this function iterates over trees, not DAGs. So it never visits the same +/// node more than once. 
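Editor's sketch (not from the patch) of the per-byte reasoning calculateByteProvider performs, restated on concrete 32-bit values; byteOf is a hypothetical helper, not an LLVM API.

#include <cassert>
#include <cstdint>

static uint8_t byteOf(uint32_t V, unsigned I) { return (V >> (8 * I)) & 0xffu; }

static void byteProviderRules(uint32_t A, uint8_t Narrow) {
  // OR: a byte is provided by whichever side is not constant zero there.
  assert(byteOf((A & 0x0000ffffu) | 0xabcd0000u, 0) == byteOf(A, 0));
  // SHL by a whole number of bytes shifts the provider index down...
  assert(byteOf(A << 8, 2) == byteOf(A, 1));
  // ...and bytes below the shift amount are constant zero.
  assert(byteOf(A << 8, 0) == 0);
  // ZERO_EXTEND: bytes beyond the narrow width are constant zero.
  uint32_t Ext = Narrow; // zext i8 -> i32
  assert(byteOf(Ext, 0) == Narrow && byteOf(Ext, 1) == 0);
}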
+const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index, + unsigned Depth, + bool Root = false) { + // Typical i64 by i8 pattern requires recursion up to 8 calls depth + if (Depth == 10) + return None; + + if (!Root && !Op.hasOneUse()) + return None; + + assert(Op.getValueType().isScalarInteger() && "can't handle other types"); + unsigned BitWidth = Op.getValueSizeInBits(); + if (BitWidth % 8 != 0) + return None; + unsigned ByteWidth = BitWidth / 8; + assert(Index < ByteWidth && "invalid index requested"); + (void) ByteWidth; + + switch (Op.getOpcode()) { + case ISD::OR: { + auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); + if (!LHS) + return None; + auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); + if (!RHS) + return None; + + if (LHS->isConstantZero()) + return RHS; + if (RHS->isConstantZero()) + return LHS; + return None; + } + case ISD::SHL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return None; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8 != 0) + return None; + uint64_t ByteShift = BitShift / 8; + + return Index < ByteShift + ? ByteProvider::getConstantZero() + : calculateByteProvider(Op->getOperand(0), Index - ByteShift, + Depth + 1); + } + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: { + SDValue NarrowOp = Op->getOperand(0); + unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return None; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (Index >= NarrowByteWidth) + return Op.getOpcode() == ISD::ZERO_EXTEND + ? Optional<ByteProvider>(ByteProvider::getConstantZero()) + : None; + return calculateByteProvider(NarrowOp, Index, Depth + 1); + } + case ISD::BSWAP: + return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, + Depth + 1); + case ISD::LOAD: { + auto L = cast<LoadSDNode>(Op.getNode()); + if (L->isVolatile() || L->isIndexed()) + return None; + + unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return None; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (Index >= NarrowByteWidth) + return L->getExtensionType() == ISD::ZEXTLOAD + ? Optional<ByteProvider>(ByteProvider::getConstantZero()) + : None; + return ByteProvider::getMemory(L, Index); + } + } + + return None; +} } // namespace +/// Match a pattern where a wide type scalar value is loaded by several narrow +/// loads and combined by shifts and ors. Fold it into a single load or a load +/// and a BSWAP if the targets supports it. +/// +/// Assuming little endian target: +/// i8 *a = ... +/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) +/// => +/// i32 val = *((i32)a) +/// +/// i8 *a = ... +/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] +/// => +/// i32 val = BSWAP(*((i32)a)) +/// +/// TODO: This rule matches complex patterns with OR node roots and doesn't +/// interact well with the worklist mechanism. When a part of the pattern is +/// updated (e.g. one of the loads) its direct users are put into the worklist, +/// but the root node of the pattern which triggers the load combine is not +/// necessarily a direct user of the changed node. 
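Editor's illustration (not from the patch) of the equivalence this combine exploits, using memcpy so the wide access is well defined: on a little-endian host the shifted OR of the byte loads equals one 32-bit load, and on a big-endian host it equals the byte-swapped load. __builtin_bswap32 is a GCC/Clang builtin used only for the check.

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t loadCombined(const uint8_t *A) {
  uint32_t ViaBytes = (uint32_t)A[0] | ((uint32_t)A[1] << 8) |
                      ((uint32_t)A[2] << 16) | ((uint32_t)A[3] << 24);
  uint32_t Wide;
  std::memcpy(&Wide, A, sizeof(Wide)); // the single wide load
  const uint16_t Probe = 1;
  bool LittleEndian = *reinterpret_cast<const uint8_t *>(&Probe) == 1;
  assert(ViaBytes == (LittleEndian ? Wide : __builtin_bswap32(Wide)));
  return ViaBytes;
}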
For example, once the address +/// of t28 load is reassociated load combine won't be triggered: +/// t25: i32 = add t4, Constant:i32<2> +/// t26: i64 = sign_extend t25 +/// t27: i64 = add t2, t26 +/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64 +/// t29: i32 = zero_extend t28 +/// t32: i32 = shl t29, Constant:i8<8> +/// t33: i32 = or t23, t32 +/// As a possible fix visitLoad can check if the load can be a part of a load +/// combine pattern and add corresponding OR roots to the worklist. +SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { + assert(N->getOpcode() == ISD::OR && + "Can only match load combining against OR nodes"); + + // Handles simple types only + EVT VT = N->getValueType(0); + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + unsigned ByteWidth = VT.getSizeInBits() / 8; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Before legalize we can introduce too wide illegal loads which will be later + // split into legal sized loads. This enables us to combine i64 load by i8 + // patterns to a couple of i32 loads on 32 bit targets. + if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = []( + unsigned BW, unsigned i) { return i; }; + std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = []( + unsigned BW, unsigned i) { return BW - i - 1; }; + + bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); + auto MemoryByteOffset = [&] (ByteProvider P) { + assert(P.isMemory() && "Must be a memory byte provider"); + unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); + assert(LoadBitWidth % 8 == 0 && + "can only analyze providers for individual bytes not bit"); + unsigned LoadByteWidth = LoadBitWidth / 8; + return IsBigEndianTarget + ? BigEndianByteAt(LoadByteWidth, P.ByteOffset) + : LittleEndianByteAt(LoadByteWidth, P.ByteOffset); + }; + + Optional<BaseIndexOffset> Base; + SDValue Chain; + + SmallSet<LoadSDNode *, 8> Loads; + Optional<ByteProvider> FirstByteProvider; + int64_t FirstOffset = INT64_MAX; + + // Check if all the bytes of the OR we are looking at are loaded from the same + // base address. Collect bytes offsets from Base address in ByteOffsets. 
+ SmallVector<int64_t, 4> ByteOffsets(ByteWidth); + for (unsigned i = 0; i < ByteWidth; i++) { + auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); + if (!P || !P->isMemory()) // All the bytes must be loaded from memory + return SDValue(); + + LoadSDNode *L = P->Load; + assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && + "Must be enforced by calculateByteProvider"); + assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); + + // All loads must share the same chain + SDValue LChain = L->getChain(); + if (!Chain) + Chain = LChain; + else if (Chain != LChain) + return SDValue(); + + // Loads must share the same base address + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); + int64_t ByteOffsetFromBase = 0; + if (!Base) + Base = Ptr; + else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) + return SDValue(); + + // Calculate the offset of the current byte from the base address + ByteOffsetFromBase += MemoryByteOffset(*P); + ByteOffsets[i] = ByteOffsetFromBase; + + // Remember the first byte load + if (ByteOffsetFromBase < FirstOffset) { + FirstByteProvider = P; + FirstOffset = ByteOffsetFromBase; + } + + Loads.insert(L); + } + assert(Loads.size() > 0 && "All the bytes of the value must be loaded from " + "memory, so there must be at least one load which produces the value"); + assert(Base && "Base address of the accessed memory location must be set"); + assert(FirstOffset != INT64_MAX && "First byte offset must be set"); + + // Check if the bytes of the OR we are looking at match with either big or + // little endian value load + bool BigEndian = true, LittleEndian = true; + for (unsigned i = 0; i < ByteWidth; i++) { + int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; + LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); + BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); + if (!BigEndian && !LittleEndian) + return SDValue(); + } + assert((BigEndian != LittleEndian) && "should be either or"); + assert(FirstByteProvider && "must be set"); + + // Ensure that the first byte is loaded from zero offset of the first load. + // So the combined value can be loaded from the first load address. + if (MemoryByteOffset(*FirstByteProvider) != 0) + return SDValue(); + LoadSDNode *FirstLoad = FirstByteProvider->Load; + + // The node we are looking at matches with the pattern, check if we can + // replace it with a single load and bswap if needed. + + // If the load needs byte swap check if the target supports it + bool NeedsBswap = IsBigEndianTarget != BigEndian; + + // Before legalize we can introduce illegal bswaps which will be later + // converted to an explicit bswap sequence. This way we end up with a single + // load and byte shuffling instead of several loads and byte shuffling. + if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) + return SDValue(); + + // Check that a load of the wide type is both allowed and fast on the target + bool Fast = false; + bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + VT, FirstLoad->getAddressSpace(), + FirstLoad->getAlignment(), &Fast); + if (!Allowed || !Fast) + return SDValue(); + + SDValue NewLoad = + DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); + + // Transfer chain users from old loads to the new load. 
+ for (LoadSDNode *L : Loads) + DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); + + return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; +} + SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4386,6 +5042,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate xor if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1)) return RXOR; @@ -4403,9 +5063,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: - return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC); + return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC); case ISD::SELECT_CC: - return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2), + return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2), N0.getOperand(3), NotCC); } } @@ -4470,6 +5130,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { N01C->getAPIntValue(), DL, VT)); } } + + // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) && + TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { + if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) + if (C->getAPIntValue() == (OpSizeInBits - 1)) + return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0)); + } + // fold (xor x, x) -> 0 if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); @@ -4505,8 +5176,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return Tmp; // Simplify the expression using non-local knowledge. - if (!VT.isVector() && - SimplifyDemandedBits(SDValue(N, 0))) + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); @@ -4613,13 +5283,51 @@ SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { } SDValue DAGCombiner::visitRotate(SDNode *N) { + SDLoc dl(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + unsigned Bitsize = VT.getScalarSizeInBits(); + + // fold (rot x, 0) -> x + if (isNullConstantOrNullSplatConstant(N1)) + return N0; + + // fold (rot x, c) -> (rot x, c % BitSize) + if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { + if (Cst->getAPIntValue().uge(Bitsize)) { + uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); + return DAG.getNode(N->getOpcode(), dl, VT, N0, + DAG.getConstant(RotAmt, dl, N1.getValueType())); + } + } + // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). 
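The rotate-amount folds added above rely on rotation being periodic in the bit width. A minimal sketch of that identity in plain C++ (rotl32 is a hypothetical helper used only for this check):

    #include <cstdint>

    constexpr uint32_t rotl32(uint32_t x, unsigned c) {
      return (c % 32) == 0 ? x : (x << (c % 32)) | (x >> (32 - (c % 32)));
    }

    // Composing two rotates adds the amounts modulo the bit width, matching
    // the (rot* (rot* x, c2), c1) fold: 20 + 16 = 36 = 4 (mod 32).
    static_assert(rotl32(rotl32(0x12345678u, 20), 16) == rotl32(0x12345678u, 4),
                  "rotate amounts compose modulo the bit width");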
- if (N->getOperand(1).getOpcode() == ISD::TRUNCATE && - N->getOperand(1).getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = - distributeTruncateThroughAnd(N->getOperand(1).getNode())) - return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), - N->getOperand(0), NewOp1); + if (N1.getOpcode() == ISD::TRUNCATE && + N1.getOperand(0).getOpcode() == ISD::AND) { + if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) + return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); + } + + unsigned NextOp = N0.getOpcode(); + // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) + if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { + SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { + EVT ShiftVT = C1->getValueType(0); + bool SameSide = (N->getOpcode() == NextOp); + unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; + if (SDValue CombinedShift = + DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { + SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); + SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( + ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), + BitsizeC.getNode()); + return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), + CombinedShiftNorm); + } + } } return SDValue(); } @@ -4662,7 +5370,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); // fold (shl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -4673,6 +5381,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl undef, x) -> 0 if (N0.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -4763,7 +5475,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && - cast<BinaryWithFlagsSDNode>(N0)->Flags.hasExact()) { + N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t C1 = N0C1->getZExtValue(); uint64_t C2 = N1C->getZExtValue(); @@ -4788,12 +5500,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); SDValue Shift; if (c2 > c1) { - Mask = Mask.shl(c2 - c1); + Mask <<= c2 - c1; SDLoc DL(N); Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), DAG.getConstant(c2 - c1, DL, N1.getValueType())); } else { - Mask = Mask.lshr(c1 - c2); + Mask.lshrInPlace(c1 - c2); SDLoc DL(N); Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), DAG.getConstant(c1 - c2, DL, N1.getValueType())); @@ -4808,9 +5520,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) && isConstantOrConstantVector(N1, /* No Opaques */ true)) { - unsigned BitSize = VT.getScalarSizeInBits(); SDLoc DL(N); - SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT); + SDValue AllBits = 
DAG.getAllOnesConstant(DL, VT); SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask); } @@ -4851,6 +5562,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { unsigned OpSizeInBits = VT.getScalarSizeInBits(); // Arithmetic shifting an all-sign-bit value is a no-op. + // fold (sra 0, x) -> 0 + // fold (sra -1, x) -> -1 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) return N0; @@ -4865,18 +5578,16 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); - // fold (sra 0, x) -> 0 - if (isNullConstant(N0)) - return N0; - // fold (sra -1, x) -> -1 - if (isAllOnesConstant(N0)) - return N0; // fold (sra x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) return DAG.getUNDEF(VT); // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { @@ -5016,7 +5727,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); // fold (srl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -5024,6 +5735,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // if (srl x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -5049,24 +5764,24 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) if (N1C && N0.getOpcode() == ISD::TRUNCATE && - N0.getOperand(0).getOpcode() == ISD::SRL && - isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) { - uint64_t c1 = - cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue(); - uint64_t c2 = N1C->getZExtValue(); - EVT InnerShiftVT = N0.getOperand(0).getValueType(); - EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType(); - uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); - // This is only valid if the OpSizeInBits + c1 = size of inner shift. - if (c1 + OpSizeInBits == InnerShiftSize) { - SDLoc DL(N0); - if (c1 + c2 >= InnerShiftSize) - return DAG.getConstant(0, DL, VT); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(ISD::SRL, DL, InnerShiftVT, - N0.getOperand(0)->getOperand(0), - DAG.getConstant(c1 + c2, DL, - ShiftCountVT))); + N0.getOperand(0).getOpcode() == ISD::SRL) { + if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) { + uint64_t c1 = N001C->getZExtValue(); + uint64_t c2 = N1C->getZExtValue(); + EVT InnerShiftVT = N0.getOperand(0).getValueType(); + EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType(); + uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); + // This is only valid if the OpSizeInBits + c1 = size of inner shift. 
+ if (c1 + OpSizeInBits == InnerShiftSize) { + SDLoc DL(N0); + if (c1 + c2 >= InnerShiftSize) + return DAG.getConstant(0, DL, VT); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getNode(ISD::SRL, DL, InnerShiftVT, + N0.getOperand(0).getOperand(0), + DAG.getConstant(c1 + c2, DL, + ShiftCountVT))); + } } } @@ -5074,9 +5789,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && isConstantOrConstantVector(N1, /* NoOpaques */ true)) { SDLoc DL(N); - APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); SDValue Mask = - DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1); + DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); AddToWorklist(Mask.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); } @@ -5097,7 +5811,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { DAG.getConstant(ShiftAmt, DL0, getShiftAmountTy(SmallVT))); AddToWorklist(SmallShift.getNode()); - APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt); + APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt); SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift), @@ -5115,20 +5829,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit). if (N1C && N0.getOpcode() == ISD::CTLZ && N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { - APInt KnownZero, KnownOne; - DAG.computeKnownBits(N0.getOperand(0), KnownZero, KnownOne); + KnownBits Known; + DAG.computeKnownBits(N0.getOperand(0), Known); // If any of the input bits are KnownOne, then the input couldn't be all // zeros, thus the result of the srl will always be zero. - if (KnownOne.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT); + if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT); // If all of the bits input the to ctlz node are known to be zero, then // the result of the ctlz is "32" and the result of the shift is one. - APInt UnknownBits = ~KnownZero; + APInt UnknownBits = ~Known.Zero; if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT); // Otherwise, check to see if there is exactly one bit input to the ctlz. - if ((UnknownBits & (UnknownBits - 1)) == 0) { + if (UnknownBits.isPowerOf2()) { // Okay, we know that only that the single bit specified by UnknownBits // could be set on input to the CTLZ node. If this bit is set, the SRL // will return 0, if it is clear, it returns 1. 
Change the CTLZ/SRL pair @@ -5202,6 +5916,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitABS(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // fold (abs c1) -> c2 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) + return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); + // fold (abs (abs x)) -> (abs x) + if (N0.getOpcode() == ISD::ABS) + return N0; + // fold (abs x) -> x iff not-negative + if (DAG.SignBitIsZero(N0)) + return N0; + return SDValue(); +} + SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -5217,7 +5947,11 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + // fold (bitreverse c1) -> c2 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) + return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); // fold (bitreverse (bitreverse x)) -> x if (N0.getOpcode() == ISD::BITREVERSE) return N0.getOperand(0); @@ -5311,7 +6045,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, } } -// TODO: We should handle other cases of selecting between {-1,0,1} here. SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -5320,6 +6053,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { EVT CondVT = Cond.getValueType(); SDLoc DL(N); + if (!VT.isInteger()) + return SDValue(); + + auto *C1 = dyn_cast<ConstantSDNode>(N1); + auto *C2 = dyn_cast<ConstantSDNode>(N2); + if (!C1 || !C2) + return SDValue(); + + // Only do this before legalization to avoid conflicting with target-specific + // transforms in the other direction (create a select from a zext/sext). There + // is also a target-independent combine here in DAGCombiner in the other + // direction for (select Cond, -1, 0) when the condition is not i1. + if (CondVT == MVT::i1 && !LegalOperations) { + if (C1->isNullValue() && C2->isOne()) { + // select Cond, 0, 1 --> zext (!Cond) + SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); + if (VT != MVT::i1) + NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); + return NotCond; + } + if (C1->isNullValue() && C2->isAllOnesValue()) { + // select Cond, 0, -1 --> sext (!Cond) + SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); + if (VT != MVT::i1) + NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); + return NotCond; + } + if (C1->isOne() && C2->isNullValue()) { + // select Cond, 1, 0 --> zext (Cond) + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + return Cond; + } + if (C1->isAllOnesValue() && C2->isNullValue()) { + // select Cond, -1, 0 --> sext (Cond) + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + return Cond; + } + + // For any constants that differ by 1, we can transform the select into an + // extend and add. Use a target hook because some targets may prefer to + // transform in the other direction. 
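Concretely, the two "differ by 1" cases handled by the code just below behave like this in scalar C++ (a small sanity check, not DAG code; unsigned wraparound stands in for the i32 add):

    #include <cstdint>

    // select Cond, C1, C1-1  -->  add (zext Cond), C1-1
    constexpr uint32_t sel_hi(bool c)   { return c ? 5u : 4u; }
    constexpr uint32_t via_zext(bool c) { return (uint32_t)c + 4u; }
    static_assert(sel_hi(true) == via_zext(true) &&
                  sel_hi(false) == via_zext(false), "zext form matches");

    // select Cond, C1, C1+1  -->  add (sext Cond), C1+1   (sext of i1 true is -1)
    constexpr uint32_t sel_lo(bool c)   { return c ? 4u : 5u; }
    constexpr uint32_t via_sext(bool c) { return (uint32_t)(c ? -1 : 0) + 5u; }
    static_assert(sel_lo(true) == via_sext(true) &&
                  sel_lo(false) == via_sext(false), "sext form matches");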
+ if (TLI.convertSelectOfConstantsToMath()) { + if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { + // select Cond, C1, C1-1 --> add (zext Cond), C1-1 + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); + } + if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { + // select Cond, C1, C1+1 --> add (sext Cond), C1+1 + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); + } + } + + return SDValue(); + } + // fold (select Cond, 0, 1) -> (xor Cond, 1) // We can't do this reliably if integer based booleans have different contents // to floating point based booleans. This is because we can't tell whether we @@ -5329,15 +6123,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { // undiscoverable (or not reasonably discoverable). For example, it could be // in another basic block or it could require searching a complicated // expression. - if (VT.isInteger() && - (CondVT == MVT::i1 || (CondVT.isInteger() && - TLI.getBooleanContents(false, true) == - TargetLowering::ZeroOrOneBooleanContent && - TLI.getBooleanContents(false, false) == - TargetLowering::ZeroOrOneBooleanContent)) && - isNullConstant(N1) && isOneConstant(N2)) { - SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, - DAG.getConstant(1, DL, CondVT)); + if (CondVT.isInteger() && + TLI.getBooleanContents(false, true) == + TargetLowering::ZeroOrOneBooleanContent && + TLI.getBooleanContents(false, false) == + TargetLowering::ZeroOrOneBooleanContent && + C1->isNullValue() && C2->isOne()) { + SDValue NotCond = + DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT)); if (VT.bitsEq(CondVT)) return NotCond; return DAG.getZExtOrTrunc(NotCond, DL, VT); @@ -5352,19 +6145,22 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); EVT VT0 = N0.getValueType(); + SDLoc DL(N); // fold (select C, X, X) -> X if (N1 == N2) return N1; + if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) { // fold (select true, X, Y) -> X // fold (select false, X, Y) -> Y return !N0C->isNullValue() ? N1 : N2; } + // fold (select X, X, Y) -> (or X, Y) // fold (select X, 1, Y) -> (or C, Y) if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1))) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2); + return DAG.getNode(ISD::OR, DL, VT, N0, N2); if (SDValue V = foldSelectOfConstants(N)) return V; @@ -5373,22 +6169,22 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); - return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2); + return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2); } // fold (select C, X, 1) -> (or (not C), X) if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); - return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1); + return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1); } // fold (select X, Y, X) -> (and X, Y) // fold (select X, Y, 0) -> (and X, Y) if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2))) - return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::AND, DL, VT, N0, N1); // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N1, N2)) - return SDValue(N, 0); // Don't revisit N. + return SDValue(N, 0); // Don't revisit N. 
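The i1 select folds above are plain boolean identities; a quick exhaustive check in C++:

    // select X, 1, Y -> or X, Y      select X, Y, 0 -> and X, Y
    constexpr bool selOr(bool x, bool y)  { return x ? true : y; }
    constexpr bool selAnd(bool x, bool y) { return x ? y : false; }
    static_assert(selOr(false, false) == false && selOr(false, true) &&
                  selOr(true, false) && selOr(true, true), "select X, 1, Y is OR");
    static_assert(!selAnd(false, false) && !selAnd(false, true) &&
                  !selAnd(true, false) && selAnd(true, true), "select X, Y, 0 is AND");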
if (VT0 == MVT::i1) { // The code in this block deals with the following 2 equivalences: @@ -5399,27 +6195,27 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // to the right anyway if we find the inner select exists in the DAG anyway // and we always transform to the left side if we know that we can further // optimize the combination of the conditions. - bool normalizeToSequence - = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); + bool normalizeToSequence = + TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); // select (and Cond0, Cond1), X, Y // -> select Cond0, (select Cond1, X, Y), Y if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), - N1.getValueType(), Cond1, N1, N2); + SDValue InnerSelect = + DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); if (normalizeToSequence || !InnerSelect.use_empty()) - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, InnerSelect, N2); } // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), - N1.getValueType(), Cond1, N1, N2); + SDValue InnerSelect = + DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); if (normalizeToSequence || !InnerSelect.use_empty()) - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N1, + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1, InnerSelect); } @@ -5431,15 +6227,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { // Create the actual and node if we can generate good code for it. if (!normalizeToSequence) { - SDValue And = DAG.getNode(ISD::AND, SDLoc(N), N0.getValueType(), - N0, N1_0); - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), And, - N1_1, N2); + SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2); } // Otherwise see if we can optimize the "and" to a better pattern. if (SDValue Combined = visitANDLike(N0, N1_0, N)) - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Combined, - N1_1, N2); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1, + N2); } } // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y @@ -5450,15 +6244,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { // Create the actual or node if we can generate good code for it. if (!normalizeToSequence) { - SDValue Or = DAG.getNode(ISD::OR, SDLoc(N), N0.getValueType(), - N0, N2_0); - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Or, - N1, N2_2); + SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2); } // Otherwise see if we can optimize to a better pattern. 
if (SDValue Combined = visitORLike(N0, N2_0, N)) - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Combined, - N1, N2_2); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1, + N2_2); } } } @@ -5469,8 +6261,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) { SDValue Cond0 = N0->getOperand(0); if (C->isOne()) - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), - Cond0, N2, N1); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N2, N1); } } } @@ -5487,24 +6278,21 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // FIXME: Instead of testing for UnsafeFPMath, this should be checking for // no signed zeros as well as no nans. const TargetOptions &Options = DAG.getTarget().Options; - if (Options.UnsafeFPMath && - VT.isFloatingPoint() && N0.hasOneUse() && + if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() && DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - if (SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), N1, N2, CC, - TLI, DAG)) + if (SDValue FMinMax = combineMinNumMaxNum( + DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) return FMinMax; } if ((!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || TLI.isOperationLegal(ISD::SELECT_CC, VT)) - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), - N1, N2, N0.getOperand(2)); - return SimplifySelect(SDLoc(N), N0, N1, N2); + return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0), + N0.getOperand(1), N1, N2, N0.getOperand(2)); + return SimplifySelect(DL, N0, N1, N2); } return SDValue(); @@ -5847,7 +6635,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { ISD::NON_EXTLOAD, MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, - MLD->isExpandingLoad()); + MLD->isExpandingLoad()); MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), @@ -5908,6 +6696,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (isAbs) { EVT VT = LHS.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) + return DAG.getNode(ISD::ABS, DL, VT, LHS); + SDValue Shift = DAG.getNode( ISD::SRA, DL, VT, LHS, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); @@ -5921,34 +6712,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. - // If the VSELECT result requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - if (N0.getOpcode() == ISD::SETCC) { - EVT VT = N->getValueType(0); - - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), VT) != - TargetLowering::TypeSplitVector) - return SDValue(); - - SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH; - std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG); - std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1); - std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2); - - Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL); - Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH); - - // Add the new VSELECT nodes to the work list in case they need to be split - // again. 
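Earlier in visitVSELECT above, the recognized abs pattern is now emitted as ISD::ABS when the target supports it; otherwise it still falls back to the sra/add/xor expansion, which corresponds to the classic scalar identity below (a sketch that assumes an arithmetic right shift and two's complement wraparound):

    #include <cstdint>

    constexpr uint32_t abs_expansion(int32_t x) {
      uint32_t ux   = (uint32_t)x;
      uint32_t sign = (uint32_t)(x >> 31);   // sra x, 31: 0 or all ones
      return (ux + sign) ^ sign;             // (x + sign) ^ sign == |x|
    }
    static_assert(abs_expansion(-5) == 5u && abs_expansion(7) == 7u, "abs identity");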
- AddToWorklist(Lo.getNode()); - AddToWorklist(Hi.getNode()); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - } - // Fold (vselect (build_vector all_ones), N1, N2) -> N1 if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; @@ -6030,6 +6793,19 @@ SDValue DAGCombiner::visitSETCCE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + + // If Carry is false, fold to a regular SETCC. + if (isNullConstant(Carry)) + return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); + + return SDValue(); +} + /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. /// This function is called by the DAGCombiner when visiting sext/zext/aext @@ -6258,6 +7034,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); + // Simplify TF. + AddToWorklist(NewChain.getNode()); + CombineTo(N, NewValue); // Replace uses of the original load (before extension) @@ -6270,9 +7049,55 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } +/// If we're narrowing or widening the result of a vector select and the final +/// size is the same size as a setcc (compare) feeding the select, then try to +/// apply the cast operation to the select's operands because matching vector +/// sizes for a select condition and other operands should be more efficient. +SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) { + unsigned CastOpcode = Cast->getOpcode(); + assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND || + CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND || + CastOpcode == ISD::FP_ROUND) && + "Unexpected opcode for vector select narrowing/widening"); + + // We only do this transform before legal ops because the pattern may be + // obfuscated by target-specific operations after legalization. Do not create + // an illegal select op, however, because that may be difficult to lower. + EVT VT = Cast->getValueType(0); + if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + SDValue VSel = Cast->getOperand(0); + if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() || + VSel.getOperand(0).getOpcode() != ISD::SETCC) + return SDValue(); + + // Does the setcc have the same vector size as the casted select? + SDValue SetCC = VSel.getOperand(0); + EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType()); + if (SetCCVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + + // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B) + SDValue A = VSel.getOperand(1); + SDValue B = VSel.getOperand(2); + SDValue CastA, CastB; + SDLoc DL(Cast); + if (CastOpcode == ISD::FP_ROUND) { + // FP_ROUND (fptrunc) has an extra flag operand to pass along. 
+ CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1)); + CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1)); + } else { + CastA = DAG.getNode(CastOpcode, DL, VT, A); + CastB = DAG.getNode(CastOpcode, DL, VT, B); + } + return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB); +} + SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) @@ -6281,8 +7106,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, - N0.getOperand(0)); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) @@ -6314,12 +7138,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign // bits, just sext from i32. if (NumSignBits > OpBits-MidBits) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op); } else { // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign // bits, just truncate to i32. if (NumSignBits > OpBits-MidBits) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } // fold (sext (truncate x)) -> (sextinreg x). @@ -6329,7 +7153,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); else if (OpBits > DestBits) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op, + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(N0.getValueType())); } } @@ -6349,17 +7173,20 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), - ISD::SIGN_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); + // If the load value is used only by N, replace it via CombineTo N. 
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); } } @@ -6376,8 +7203,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); @@ -6411,32 +7237,38 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { LN0->getMemOperand()); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.sext(VT.getSizeInBits()); - SDLoc DL(N); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, And); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, - ISD::SIGN_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! } } } if (N0.getOpcode() == ISD::SETCC) { - EVT N0VT = N0.getOperand(0).getValueType(); + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + EVT N00VT = N0.getOperand(0).getValueType(); + // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && - TLI.getBooleanContents(N0VT) == + TLI.getBooleanContents(N00VT) == TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. - EVT SVT = getSetCCResultType(N0VT); + EVT SVT = getSetCCResultType(N00VT); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result @@ -6444,19 +7276,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == SVT.getSizeInBits()) - return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSetCC(DL, VT, N00, N01, CC); // If the desired elements are smaller or larger than the source - // elements we can use a matching integer vector type and then - // truncate/sign extend - EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); - if (SVT == MatchingVectorType) { - SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, - N0.getOperand(0), N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT); + // elements, we can use a matching integer vector type and then + // truncate/sign extend. 
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); + if (SVT == MatchingVecType) { + SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); + return DAG.getSExtOrTrunc(VsetCC, DL, VT); } } @@ -6465,36 +7293,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // getBooleanContents(). unsigned SetCCWidth = N0.getScalarValueSizeInBits(); - SDLoc DL(N); // To determine the "true" side of the select, we need to know the high bit // of the value returned by the setcc if it evaluates to true. // If the type of the setcc is i1, then the true case of the select is just // sext(i1 1), that is, -1. // If the type of the setcc is larger (say, i8) then the value of the high - // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // bit depends on getBooleanContents(), so ask TLI for a real "true" value // of the appropriate width. - SDValue ExtTrueVal = - (SetCCWidth == 1) - ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), - DL, VT) - : TLI.getConstTrueVal(DAG, VT, DL); - - if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, - DAG.getConstant(0, DL, VT), - cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) + SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + SDValue Zero = DAG.getConstant(0, DL, VT); + if (SDValue SCC = + SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) return SCC; if (!VT.isVector()) { - EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); - if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { - SDLoc DL(N); - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - SDValue SetCC = - DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, - DAG.getConstant(0, DL, VT)); + EVT SetCCVT = getSetCCResultType(N00VT); + // Don't do this transform for i1 because there's a select transform + // that would reverse it. + // TODO: We should not do this transform at all without a target hook + // because a sext is likely cheaper than a select? + if (SetCCVT.getScalarSizeInBits() != 1 && + (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { + SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero); } } } @@ -6502,21 +7324,23 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext x) -> (zext x) if the sign bit is known zero. if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0); + + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; return SDValue(); } // isTruncateOf - If N is a truncate of some other value, return true, record -// the value being truncated in Op and which of Op's bits are zero in KnownZero. -// This function computes KnownZero to avoid a duplicated call to +// the value being truncated in Op and which of Op's bits are zero/one in Known. +// This function computes KnownBits to avoid a duplicated call to // computeKnownBits in the caller. 
static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, - APInt &KnownZero) { - APInt KnownOne; + KnownBits &Known) { if (N->getOpcode() == ISD::TRUNCATE) { Op = N->getOperand(0); - DAG.computeKnownBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, Known); return true; } @@ -6535,9 +7359,9 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, else return false; - DAG.computeKnownBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, Known); - if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue()) + if (!(Known.Zero | 1).isAllOnesValue()) return false; return true; @@ -6562,8 +7386,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // This is valid when the truncated bits of x are already zero. // FIXME: We should extend this to work for vectors too. SDValue Op; - APInt KnownZero; - if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) { + KnownBits Known; + if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) { APInt TruncatedBits = (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ? APInt(Op.getValueSizeInBits(), 0) : @@ -6571,14 +7395,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); - if (TruncatedBits == (KnownZero & TruncatedBits)) { - if (VT.bitsGT(Op.getValueType())) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op); - if (VT.bitsLT(Op.getValueType())) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - - return Op; - } + if (TruncatedBits.isSubsetOf(Known.Zero)) + return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } // fold (zext (truncate (load x))) -> (zext (smaller load x)) @@ -6625,14 +7443,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { - SDValue Op = N0.getOperand(0); - if (SrcVT.bitsLT(VT)) { - Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } else if (SrcVT.bitsGT(VT)) { - Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } + SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + AddToWorklist(Op.getNode()); return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); } } @@ -6646,11 +7458,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); - } + X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); @@ -6677,14 +7485,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), - ISD::ZERO_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); + // If the load value is used only by N, replace it via CombineTo N. 
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -6734,11 +7546,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, And); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, - ISD::ZERO_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! } } } @@ -6837,6 +7652,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { ShAmt); } + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -6871,14 +7689,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { } // fold (aext (truncate x)) - if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue TruncOp = N0.getOperand(0); - if (TruncOp.getValueType() == VT) - return TruncOp; // x iff x size == zext size. - if (TruncOp.getValueType().bitsGT(VT)) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp); - return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp); - } + if (N0.getOpcode() == ISD::TRUNCATE) + return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. @@ -6889,11 +7701,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getValueType())) { SDLoc DL(N); SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, DL, VT, X); - } + X = DAG.getAnyExtOrTrunc(X, DL, VT); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, @@ -6917,13 +7725,18 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ANY_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = N0.hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! 
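A plain C++ analogy for the zext (truncate x) fold earlier in visitZERO_EXTEND: when the bits removed by the truncate are already known zero, zero-extending the truncated value gives back the original value (illustrative only):

    #include <cstdint>

    // High 32 bits of x are zero, so trunc-to-i32 followed by zext-to-i64 is
    // the identity; the DAG fold replaces the zext/trunc pair with x itself.
    constexpr uint64_t zext_of_trunc(uint64_t x) { return (uint64_t)(uint32_t)x; }
    static_assert(zext_of_trunc(0x00000000DEADBEEFull) == 0xDEADBEEFull, "");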
} } @@ -6991,9 +7804,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitAssertZext(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT EVT = cast<VTSDNode>(N1)->getVT(); + + // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt) + if (N0.getOpcode() == ISD::AssertZext && + EVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) + return N0; + + return SDValue(); +} + /// See if the specified operand can be simplified with the knowledge that only /// the bits specified by Mask are used. If so, return the simpler operand, /// otherwise return a null SDValue. +/// +/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can +/// simplify nodes with multiple uses more aggressively.) SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { switch (V.getOpcode()) { default: break; @@ -7029,6 +7858,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } + break; + case ISD::AND: { + // X & -1 -> X (ignoring bits which aren't demanded). + ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1)); + if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask) + return V.getOperand(0); + break; + } } return SDValue(); } @@ -7169,7 +8006,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, DL, PtrType), - &Flags); + Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; @@ -7244,6 +8081,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } + // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x) + if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && + N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { + if (!LegalOperations || + TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) + return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT); + } + // fold (sext_in_reg (zext x)) -> (sext x) // iff we are extending the source sign bit. if (N0.getOpcode() == ISD::ZERO_EXTEND) { @@ -7254,7 +8101,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. 
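The fold named in the comment above, checked on a concrete value: when the sign bit of the narrow type is already zero, sign extension in the register equals zero extension, i.e. a simple AND mask (minimal sketch):

    #include <cstdint>

    // Model of sext_in_reg from the low 8 bits of an i32 value.
    constexpr uint32_t sext_in_reg8(uint32_t x) {
      return (uint32_t)(int32_t)(int8_t)(uint8_t)x;
    }
    // Bit 7 of 0x7B is clear, so the sign extension is just the mask.
    static_assert(sext_in_reg8(0x1234567Bu) == (0x1234567Bu & 0xFFu), "");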
- if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits))) + if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1))) return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType()); // fold operands of sext_in_reg based on knowledge that the top bits are not @@ -7439,18 +8286,20 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) && TLI.isTypeDesirableForOp(ISD::SHL, VT)) { - if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) { - uint64_t Amt = CAmt->getZExtValue(); - unsigned Size = VT.getScalarSizeInBits(); - - if (Amt < Size) { - SDLoc SL(N); - EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + SDValue Amt = N0.getOperand(1); + KnownBits Known; + DAG.computeKnownBits(Amt, Known); + unsigned Size = VT.getScalarSizeInBits(); + if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { + SDLoc SL(N); + EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); - return DAG.getNode(ISD::SHL, SL, VT, Trunc, - DAG.getConstant(Amt, SL, AmtVT)); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); + if (AmtVT != Amt.getValueType()) { + Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); + AddToWorklist(Amt.getNode()); } + return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); } } @@ -7496,6 +8345,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { VT.getSizeInBits()))) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } + // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { @@ -7517,6 +8367,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } } + // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), // where ... are all 'undef'. if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { @@ -7582,6 +8433,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) + // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) + // When the adde's carry is not used. + if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && + N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && + (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) { + SDLoc SL(N); + auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); + auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); + auto VTs = DAG.getVTList(VT, N0->getValueType(1)); + return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2)); + } + + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -7645,11 +8512,11 @@ static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, switch (N0.getOpcode()) { case ISD::AND: FPOpcode = ISD::FABS; - SignMask = ~APInt::getSignBit(SourceVT.getSizeInBits()); + SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits()); break; case ISD::XOR: FPOpcode = ISD::FNEG; - SignMask = APInt::getSignBit(SourceVT.getSizeInBits()); + SignMask = APInt::getSignMask(SourceVT.getSizeInBits()); break; // TODO: ISD::OR --> ISD::FNABS? 
default: @@ -7672,6 +8539,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (N0.isUndef()) + return DAG.getUNDEF(VT); + // If the input is a BUILD_VECTOR with all constant elements, fold this now. // Only do this before legalize, since afterward the target may be depending // on the bitconvert. @@ -7757,7 +8627,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { assert(VT.getSizeInBits() == 128); SDValue SignBit = DAG.getConstant( - APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); + APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); SDValue FlipBit; if (N0.getOpcode() == ISD::FNEG) { FlipBit = SignBit; @@ -7777,7 +8647,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); } - APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); + APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, NewConv, DAG.getConstant(SignBit, DL, VT)); @@ -7825,7 +8695,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { - APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2); + APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2); SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(Cst.getNode()); SDValue X = DAG.getBitcast(VT, N0.getOperand(1)); @@ -7846,7 +8716,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); } - APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); + APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); AddToWorklist(X.getNode()); @@ -8029,7 +8899,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { for (unsigned j = 0; j != NumOutputsPerInput; ++j) { APInt ThisVal = OpVal.trunc(DstBitSize); Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT)); - OpVal = OpVal.lshr(DstBitSize); + OpVal.lshrInPlace(DstBitSize); } // For big endian targets, swap the order of the pieces of each element. @@ -8040,6 +8910,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { return DAG.getBuildVector(VT, DL, Ops); } +static bool isContractable(SDNode *N) { + SDNodeFlags F = N->getFlags(); + return F.hasAllowContract() || F.hasUnsafeAlgebra(); +} + /// Try to perform FMA combining on a given FADD node. SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); @@ -8048,24 +8923,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. 
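Background for the contraction gating in this hunk: a fused multiply-add rounds once where a separate fmul and fadd round twice, so fma(a, b, c) and a*b + c can produce different results. That is why the combine now requires fast FP fusion, unsafe math, an FMAD target, or per-node contract flags instead of firing unconditionally. A minimal demonstration (assumes round-to-nearest and a correctly rounded std::fma):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double e = std::ldexp(1.0, -27);   // 2^-27
      const double a = 1.0 + e, b = 1.0 - e;   // exact product is 1 - 2^-54
      double separate = a * b - 1.0;           // a*b rounds to 1.0, so this is 0.0
      double fused    = std::fma(a, b, -1.0);  // one rounding: exactly -2^-54
      std::printf("%.17g vs %.17g\n", separate, fused);
      return 0;
    }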
if (!HasFMAD && !HasFMA) return SDValue(); + bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD); + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !isContractable(N)) + return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); - ; - if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. @@ -8073,35 +8951,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // Is the node an FMUL and contractable either due to global flags or + // SDNodeFlags. + auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { + if (N.getOpcode() != ISD::FMUL) + return false; + return AllowFusionGlobally || isContractable(N.getNode()); + }; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. - if (Aggressive && N0.getOpcode() == ISD::FMUL && - N1.getOpcode() == ISD::FMUL) { + if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { if (N0.getNode()->use_size() > N1.getNode()->use_size()) std::swap(N0, N1); } // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (N0.getOpcode() == ISD::FMUL && - (Aggressive || N0->hasOneUse())) { + if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. - if (N1.getOpcode() == ISD::FMUL && - (Aggressive || N1->hasOneUse())) { + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } // Look through FP_EXTEND nodes to do more combining. - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N00)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8113,7 +8995,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Note: Commutes FADD operands. 
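The HasFMAD/HasFMA and AllowFusionGlobally/isContractable checks above exist because contraction is not value-preserving: an fma rounds x*y+z once, while the separate multiply and add round twice. A minimal standalone illustration of the difference follows (ordinary C++ with std::fma, not SelectionDAG code; the constants are just one case where the two roundings disagree).

    #include <cmath>
    #include <cstdio>

    int main() {
      // X*X is exactly 1 + 2^-29 + 2^-60; the 2^-60 term does not survive
      // rounding the multiply to double on its own.
      double X = 1.0 + std::ldexp(1.0, -30);
      double Y = 1.0 + std::ldexp(1.0, -29);

      // volatile keeps the multiply and the subtract as two separately
      // rounded operations even when the compiler could contract them.
      volatile double Prod = X * X;       // first rounding
      double Separate = Prod - Y;         // second rounding -> 0x0p+0
      double Fused = std::fma(X, X, -Y);  // single rounding -> 0x1p-60

      std::printf("separate: %a\nfused:    %a\n", Separate, Fused);
      return 0;
    }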
if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N10)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), @@ -8154,7 +9036,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N0)); } - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( @@ -8169,7 +9051,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); - if (N020.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N020)) return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), N1); @@ -8195,7 +9077,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); - if (N002.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N002)) return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), N1); @@ -8208,7 +9090,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N12 = N1.getOperand(2); if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); - if (N120.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N120)) return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), N0); @@ -8224,7 +9106,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == PreferredFusedOpcode) { SDValue N102 = N10.getOperand(2); - if (N102.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N102)) return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), N0); @@ -8244,23 +9126,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); - // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); + bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD); + // If the subtraction is not contractable, do not combine. + if (!AllowFusionGlobally && !isContractable(N)) + return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); - if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. @@ -8268,9 +9153,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // Is the node an FMUL and contractable either due to global flags or + // SDNodeFlags. 
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { + if (N.getOpcode() != ISD::FMUL) + return false; + return AllowFusionGlobally || isContractable(N.getNode()); + }; + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) - if (N0.getOpcode() == ISD::FMUL && - (Aggressive || N0->hasOneUse())) { + if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1)); @@ -8278,16 +9170,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FMUL && - (Aggressive || N1->hasOneUse())) + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0); // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) - if (N0.getOpcode() == ISD::FNEG && - N0.getOperand(0).getOpcode() == ISD::FMUL && + if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); @@ -8297,12 +9187,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N00)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8316,7 +9206,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N10)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8336,7 +9226,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N000)) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8358,7 +9248,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N000)) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8378,10 +9268,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma x, y (fma u, v, (fneg z))) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. 
- if (Options.UnsafeFPMath && - N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL && - N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { + if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && + isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && + N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8395,9 +9284,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma (fneg y), z, (fma (fneg u), v, x)) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && - N1.getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FMUL) { + if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && + isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8410,14 +9298,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N21, N0)); } - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); - if (N020.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N020)) return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8440,7 +9328,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); - if (N002.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N002)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8461,7 +9349,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N1.getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N1.getOperand(2).getOperand(0); - if (N120.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N120)) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8488,7 +9376,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N100 = N1.getOperand(0).getOperand(0); SDValue N101 = N1.getOperand(0).getOperand(1); SDValue N102 = N1.getOperand(0).getOperand(2); - if (N102.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N102)) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8601,6 +9489,14 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { return SDValue(); } +static bool isFMulNegTwo(SDValue &N) { + if (N.getOpcode() != ISD::FMUL) + return false; + if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1))) + return CFP->isExactlyValue(-2.0); + return false; +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8609,7 +9505,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags *Flags = 
&cast<BinaryWithFlagsSDNode>(N)->Flags; + const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) @@ -8624,6 +9520,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) @@ -8636,8 +9535,18 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return DAG.getNode(ISD::FSUB, DL, VT, N1, GetNegatedExpression(N0, DAG, LegalOperations), Flags); + // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) + // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B)) + if ((isFMulNegTwo(N0) && N0.hasOneUse()) || + (isFMulNegTwo(N1) && N1.hasOneUse())) { + bool N1IsFMul = isFMulNegTwo(N1); + SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags); + } + // FIXME: Auto-upgrade the target/function-level option. - if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // fold (fadd A, 0) -> A if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1)) if (N1C->isZero()) @@ -8760,7 +9669,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) @@ -8771,13 +9680,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // FIXME: Auto-upgrade the target/function-level option. 
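The isFMulNegTwo fold above trades the multiply for an extra add, and it needs no fast-math relaxation: B + B and B * -2.0 are always exact, so A + (B * -2.0) and A - (B + B) round the same real value. A standalone spot check of that claim over a few arbitrary sample values (host C++, not a proof and not LLVM code):

    #include <cstdio>

    int main() {
      const double As[] = {1.0, 3.14159, -7.5e100, 0.0};
      const double Bs[] = {2.5, -1.0e-30, 123456.789, -0.0};
      for (double A : As)
        for (double B : Bs) {
          double MulForm = A + B * -2.0;  // fadd A, (fmul B, -2.0)
          double SubForm = A - (B + B);   // fsub A, (fadd B, B)
          std::printf("A=%g B=%g  %.17g  %.17g  %s\n", A, B, MulForm, SubForm,
                      MulForm == SubForm ? "equal" : "DIFFER");
        }
      return 0;
    }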
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // (fsub 0, B) -> -B if (N0CFP && N0CFP->isZero()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) @@ -8828,7 +9740,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) { @@ -8850,6 +9762,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (Options.UnsafeFPMath) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) @@ -8914,6 +9829,52 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { } } + // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) + // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) + if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() && + (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) && + TLI.isOperationLegal(ISD::FABS, VT)) { + SDValue Select = N0, X = N1; + if (Select.getOpcode() != ISD::SELECT) + std::swap(Select, X); + + SDValue Cond = Select.getOperand(0); + auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1)); + auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2)); + + if (TrueOpnd && FalseOpnd && + Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X && + isa<ConstantFPSDNode>(Cond.getOperand(1)) && + cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETOLT: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + std::swap(TrueOpnd, FalseOpnd); + // Fall through + case ISD::SETOGT: + case ISD::SETUGT: + case ISD::SETOGE: + case ISD::SETUGE: + case ISD::SETGT: + case ISD::SETGE: + if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) && + TLI.isOperationLegal(ISD::FNEG, VT)) + return DAG.getNode(ISD::FNEG, DL, VT, + DAG.getNode(ISD::FABS, DL, VT, X)); + if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0)) + return DAG.getNode(ISD::FABS, DL, VT, X); + + break; + } + } + } + // FMUL -> FMA combines: if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) { AddToWorklist(Fused.getNode()); @@ -8969,7 +9930,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1), - &Flags), &Flags); + Flags), Flags); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) @@ -8979,7 +9940,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1), - &Flags), + Flags), N2); } } @@ -9005,16 +9966,16 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, - DAG.getConstantFP(1.0, DL, VT), &Flags), - &Flags); + DAG.getConstantFP(1.0, DL, VT), Flags), + Flags); } // (fma x, c, (fneg x)) -> (fmul x, (c-1)) if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, - DAG.getConstantFP(-1.0, DL, VT), 
&Flags), - &Flags); + DAG.getConstantFP(-1.0, DL, VT), Flags), + Flags); } } @@ -9030,8 +9991,8 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; - const SDNodeFlags *Flags = N->getFlags(); - if (!UnsafeMath && !Flags->hasAllowReciprocal()) + const SDNodeFlags Flags = N->getFlags(); + if (!UnsafeMath && !Flags.hasAllowReciprocal()) return SDValue(); // Skip if current node is a reciprocal. @@ -9054,7 +10015,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { // This division is eligible for optimization only if global unsafe math // is enabled or if this division allows reciprocal formation. - if (UnsafeMath || U->getFlags()->hasAllowReciprocal()) + if (UnsafeMath || U->getFlags().hasAllowReciprocal()) Users.insert(U); } } @@ -9093,7 +10054,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) @@ -9104,6 +10065,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { @@ -9204,8 +10168,10 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, - &cast<BinaryWithFlagsSDNode>(N)->Flags); + return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags()); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; return SDValue(); } @@ -9222,7 +10188,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { // For now, create a Flags object for use with all unsafe math transforms. SDNodeFlags Flags; Flags.setUnsafeAlgebra(true); - return buildSqrtEstimate(N0, &Flags); + return buildSqrtEstimate(N0, Flags); } /// copysign(x, fp_extend(y)) -> copysign(x, y) @@ -9497,6 +10463,9 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { Tmp, N0.getOperand(1)); } + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -9563,6 +10532,9 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -9624,11 +10596,11 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x80... per scalar element // and splat it. - SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits()); + SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x80... 
- SignMask = APInt::getSignBit(IntVT.getSizeInBits()); + SignMask = APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL0(N0); Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, @@ -9648,10 +10620,10 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (Level >= AfterLegalizeDAG && (TLI.isFPImmLegal(CVal, VT) || TLI.isOperationLegal(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, - N0.getOperand(1)), - &cast<BinaryWithFlagsSDNode>(N0)->Flags); + return DAG.getNode( + ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), + N0->getFlags()); } } @@ -9729,11 +10701,11 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x7f... per scalar element // and splat it. - SignMask = ~APInt::getSignBit(N0.getScalarValueSizeInBits()); + SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x7f... - SignMask = ~APInt::getSignBit(IntVT.getSizeInBits()); + SignMask = ~APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL(N0); Int = DAG.getNode(ISD::AND, DL, IntVT, Int, @@ -10149,7 +11121,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store) // // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the - // indexed load/store and the expresion that needs to be re-written. + // indexed load/store and the expression that needs to be re-written. // // Therefore, we have: // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1 @@ -10361,7 +11333,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); - + AddUsersToWorklist(Chain.getNode()); if (N->use_empty()) deleteAndRecombine(N); @@ -10414,7 +11386,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { StoreSDNode *PrevST = cast<StoreSDNode>(Chain); if (PrevST->getBasePtr() == Ptr && PrevST->getValue().getValueType() == N->getValueType(0)) - return CombineTo(N, Chain.getOperand(1), Chain); + return CombineTo(N, PrevST->getOperand(1), Chain); } } @@ -10432,14 +11404,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } } - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); -#ifndef NDEBUG - if (CombinerAAOnlyFunc.getNumOccurrences() && - CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) - UseAA = false; -#endif - if (UseAA && LD->isUnindexed()) { + if (LD->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); @@ -10462,12 +11427,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); - // Make sure the new and old chains are cleaned up. - AddToWorklist(Token.getNode()); - - // Replace uses with load result and token factor. Don't add users - // to work list. 
- return CombineTo(N, ReplLoad.getValue(0), Token, false); + // Replace uses with load result and token factor + return CombineTo(N, ReplLoad.getValue(0), Token); } } @@ -10490,7 +11451,7 @@ namespace { /// Shift = srl Ty1 Origin, CstTy Amount /// Inst = trunc Shift to Ty2 /// -/// Then, it will be rewriten into: +/// Then, it will be rewritten into: /// Slice = load SliceTy, Base + SliceOffset /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 /// @@ -10959,7 +11920,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { // Check if this is a trunc(lshr). if (User->getOpcode() == ISD::SRL && User->hasOneUse() && isa<ConstantSDNode>(User->getOperand(1))) { - Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue(); + Shift = User->getConstantOperandVal(1); User = *User->use_begin(); } @@ -11021,6 +11982,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); + AddToWorklist(Chain.getNode()); return true; } @@ -11414,44 +12376,36 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, return false; } -SDValue DAGCombiner::getMergedConstantVectorStore( - SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores, - SmallVectorImpl<SDValue> &Chains, EVT Ty) const { - SmallVector<SDValue, 8> BuildVector; +SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, + unsigned NumStores) { + SmallVector<SDValue, 8> Chains; + SmallPtrSet<const SDNode *, 8> Visited; + SDLoc StoreDL(StoreNodes[0].MemNode); + + for (unsigned i = 0; i < NumStores; ++i) { + Visited.insert(StoreNodes[i].MemNode); + } - for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { - StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode); - Chains.push_back(St->getChain()); - BuildVector.push_back(St->getValue()); + // don't include nodes that are children + for (unsigned i = 0; i < NumStores; ++i) { + if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) + Chains.push_back(StoreNodes[i].MemNode->getChain()); } - return DAG.getBuildVector(Ty, SL, BuildVector); + assert(Chains.size() > 0 && "Chain should have generated a chain"); + return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( - SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, - unsigned NumStores, bool IsConstantSrc, bool UseVector) { + SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, + bool IsConstantSrc, bool UseVector, bool UseTrunc) { // Make sure we have something to merge. if (NumStores < 2) return false; int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - unsigned LatestNodeUsed = 0; - - for (unsigned i=0; i < NumStores; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // latest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) - LatestNodeUsed = i; - } - - SmallVector<SDValue, 8> Chains; // The latest Node in the DAG. 
- LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); SDValue StoredVal; @@ -11467,7 +12421,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); if (IsConstantSrc) { - StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty); + SmallVector<SDValue, 8> BuildVector; + for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode); + SDValue Val = St->getValue(); + if (MemVT.getScalarType().isInteger()) + if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue())) + Val = DAG.getConstant( + (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(), + SDLoc(CFP), MemVT); + BuildVector.push_back(Val); + } + StoredVal = DAG.getBuildVector(Ty, DL, BuildVector); } else { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumStores; ++i) { @@ -11477,7 +12442,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); - Chains.push_back(St->getChain()); } // Build the extracted vector elements back into a vector. @@ -11497,14 +12461,13 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( for (unsigned i = 0; i < NumStores; ++i) { unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); - Chains.push_back(St->getChain()); SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { - StoreInt |= C->getAPIntValue().zext(SizeInBits); + StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { - StoreInt |= C->getValueAPF().bitcastToAPInt().zext(SizeInBits); + StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); } else { llvm_unreachable("Invalid constant element type"); } @@ -11515,194 +12478,181 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - assert(!Chains.empty()); - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, - FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), - FirstInChain->getAlignment()); - - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - if (UseAA) { - // Replace all merged stores with the new store. - for (unsigned i = 0; i < NumStores; ++i) - CombineTo(StoreNodes[i].MemNode, NewStore); - } else { - // Replace the last store with the new store. - CombineTo(LatestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumStores; ++i) { - if (StoreNodes[i].MemNode == LatestOp) - continue; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - // ReplaceAllUsesWith will replace all uses that existed when it was - // called, but graph optimizations may cause new ones to appear. For - // example, the case in pr14333 looks like - // - // St's chain -> St -> another store -> X - // - // And the only difference from St to the other store is the chain. - // When we change it's chain to be St's chain they become identical, - // get CSEed and the net result is that X is now a use of St. - // Since we know that St is redundant, just iterate. 
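The constant-packing loop above (shift by the element width, OR in the next value, visiting elements in reverse on little-endian targets) is what lets several narrow constant stores become one wide store with identical bytes in memory. Below is a standalone sketch of the same packing on the host; it is plain C++, Probe/Vals are illustrative names, and the host's endianness stands in for the target data layout.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // Memory produced by four consecutive one-byte constant stores.
      unsigned char FourStores[4] = {0x11, 0x22, 0x33, 0x44};

      // Determine host endianness, standing in for the target's layout.
      uint16_t Probe = 1;
      unsigned char ProbeByte;
      std::memcpy(&ProbeByte, &Probe, 1);
      const bool IsLE = (ProbeByte == 1);

      // Pack the four constants the way the merge loop does: shift left by
      // the element size and OR in the next value, in reverse order when
      // the target is little-endian.
      const uint8_t Vals[4] = {0x11, 0x22, 0x33, 0x44};
      uint32_t StoreInt = 0;
      for (int i = 0; i < 4; ++i) {
        unsigned Idx = IsLE ? (4 - 1 - i) : i;
        StoreInt = (StoreInt << 8) | Vals[Idx];
      }

      // Memory produced by the single merged 32-bit store.
      unsigned char OneStore[4];
      std::memcpy(OneStore, &StoreInt, sizeof OneStore);

      std::printf("packed constant 0x%08x, memory %s\n", (unsigned)StoreInt,
                  std::memcmp(FourStores, OneStore, 4) == 0 ? "matches" : "differs");
      return 0;
    }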
- while (!St->use_empty()) - DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); - } - } - - StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end()); + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); + + // make sure we use trunc store if it's necessary to be legal. + SDValue NewStore; + if (UseVector || !UseTrunc) { + NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), + FirstInChain->getAlignment()); + } else { // Must be realized as a trunc store + EVT LegalizedStoredValueTy = + TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); + unsigned LegalizedStoreSize = LegalizedStoredValueTy.getSizeInBits(); + ConstantSDNode *C = cast<ConstantSDNode>(StoredVal); + SDValue ExtendedStoreVal = + DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL, + LegalizedStoredValueTy); + NewStore = DAG.getTruncStore( + NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, + FirstInChain->getAlignment(), + FirstInChain->getMemOperand()->getFlags()); + } + + // Replace all merged stores with the new store. + for (unsigned i = 0; i < NumStores; ++i) + CombineTo(StoreNodes[i].MemNode, NewStore); + + AddToWorklist(NewChain.getNode()); return true; } -void DAGCombiner::getStoreMergeAndAliasCandidates( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, - SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) { +void DAGCombiner::getStoreMergeCandidates( + StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); + EVT MemVT = St->getMemoryVT(); // We must have a base and an offset. - if (!BasePtr.Base.getNode()) + if (!BasePtr.getBase().getNode()) return; // Do not handle stores to undef base pointers. - if (BasePtr.Base.isUndef()) + if (BasePtr.getBase().isUndef()) return; - // Walk up the chain and look for nodes with offsets from the same - // base pointer. Stop when reaching an instruction with a different kind - // or instruction which has a different base pointer. - EVT MemVT = St->getMemoryVT(); - unsigned Seq = 0; - StoreSDNode *Index = St; - - - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - - if (UseAA) { - // Look at other users of the same chain. Stores on the same chain do not - // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized - // to be on the same chain, so don't bother looking at adjacent chains. 
- - SDValue Chain = St->getChain(); - for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) { - if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { - if (I.getOperandNo() != 0) - continue; - - if (OtherST->isVolatile() || OtherST->isIndexed()) - continue; - - if (OtherST->getMemoryVT() != MemVT) - continue; - - BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG); - - if (Ptr.equalBaseIndex(BasePtr)) - StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++)); - } + bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) || + isa<ConstantFPSDNode>(St->getValue()); + bool IsExtractVecSrc = + (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || + St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); + bool IsLoadSrc = isa<LoadSDNode>(St->getValue()); + BaseIndexOffset LBasePtr; + // Match on loadbaseptr if relevant. + if (IsLoadSrc) + LBasePtr = BaseIndexOffset::match( + cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG); + + auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, + int64_t &Offset) -> bool { + if (Other->isVolatile() || Other->isIndexed()) + return false; + // We can merge constant floats to equivalent integers + if (Other->getMemoryVT() != MemVT) + if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) && + isa<ConstantFPSDNode>(Other->getValue()))) + return false; + if (IsLoadSrc) { + // The Load's Base Ptr must also match + if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) { + auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG); + if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) + return false; + } else + return false; } - - return; - } - - while (Index) { - // If the chain has more than one use, then we can't reorder the mem ops. - if (Index != St && !SDValue(Index, 0)->hasOneUse()) - break; - - // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); - - // Check that the base pointer is the same as the original one. - if (!Ptr.equalBaseIndex(BasePtr)) - break; - - // The memory operands must not be volatile. - if (Index->isVolatile() || Index->isIndexed()) - break; - - // No truncation. - if (Index->isTruncatingStore()) - break; - - // The stored memory type must be the same. - if (Index->getMemoryVT() != MemVT) - break; - - // We do not allow under-aligned stores in order to prevent - // overriding stores. NOTE: this is a bad hack. Alignment SHOULD - // be irrelevant here; what MATTERS is that we not move memory - // operations that potentially overlap past each-other. - if (Index->getAlignment() < MemVT.getStoreSize()) - break; - - // We found a potential memory operand to merge. - StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); - - // Find the next memory operand in the chain. If the next operand in the - // chain is a store then move up and continue the scan with the next - // memory operand. If the next operand is a load save it and use alias - // information to check if it interferes with anything. - SDNode *NextInChain = Index->getChain().getNode(); - while (1) { - if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { - // We found a store node. Use it for the next iteration. 
- Index = STn; - break; - } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) { - if (Ldn->isVolatile()) { - Index = nullptr; - break; + if (IsConstantSrc) + if (!(isa<ConstantSDNode>(Other->getValue()) || + isa<ConstantFPSDNode>(Other->getValue()))) + return false; + if (IsExtractVecSrc) + if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)) + return false; + Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); + return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); + }; + // We looking for a root node which is an ancestor to all mergable + // stores. We search up through a load, to our root and then down + // through all children. For instance we will find Store{1,2,3} if + // St is Store1, Store2. or Store3 where the root is not a load + // which always true for nonvolatile ops. TODO: Expand + // the search to find all valid candidates through multiple layers of loads. + // + // Root + // |-------|-------| + // Load Load Store3 + // | | + // Store1 Store2 + // + // FIXME: We should be able to climb and + // descend TokenFactors to find candidates as well. + + SDNode *RootNode = (St->getChain()).getNode(); + + if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) { + RootNode = Ldn->getChain().getNode(); + for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) + if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain + for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) + if (I2.getOperandNo() == 0) + if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) { + BaseIndexOffset Ptr; + int64_t PtrDiff; + if (CandidateMatch(OtherST, Ptr, PtrDiff)) + StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); + } + } else + for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) + if (I.getOperandNo() == 0) + if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { + BaseIndexOffset Ptr; + int64_t PtrDiff; + if (CandidateMatch(OtherST, Ptr, PtrDiff)) + StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } - - // Save the load node for later. Continue the scan. - AliasLoadNodes.push_back(Ldn); - NextInChain = Ldn->getChain().getNode(); - continue; - } else { - Index = nullptr; - break; - } - } - } } -// We need to check that merging these stores does not cause a loop -// in the DAG. Any store candidate may depend on another candidate +// We need to check that merging these stores does not cause a loop in +// the DAG. Any store candidate may depend on another candidate // indirectly through its operand (we already consider dependencies // through the chain). Check in parallel by searching up from // non-chain operands of candidates. + bool DAGCombiner::checkMergeStoreCandidatesForDependencies( - SmallVectorImpl<MemOpLink> &StoreNodes) { + SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) { + + // FIXME: We should be able to truncate a full search of + // predecessors by doing a BFS and keeping tabs the originating + // stores from which worklist nodes come from in a similar way to + // TokenFactor simplfication. + SmallPtrSet<const SDNode *, 16> Visited; SmallVector<const SDNode *, 8> Worklist; - // search ops of store candidates - for (unsigned i = 0; i < StoreNodes.size(); ++i) { + unsigned int Max = 8192; + // Search Ops of store candidates. 
+ for (unsigned i = 0; i < NumStores; ++i) { SDNode *n = StoreNodes[i].MemNode; // Potential loops may happen only through non-chain operands for (unsigned j = 1; j < n->getNumOperands(); ++j) Worklist.push_back(n->getOperand(j).getNode()); } - // search through DAG. We can stop early if we find a storenode - for (unsigned i = 0; i < StoreNodes.size(); ++i) { - if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist)) + // Search through DAG. We can stop early if we find a store node. + for (unsigned i = 0; i < NumStores; ++i) { + if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist, + Max)) + return false; + // Check if we ended early, failing conservatively if so. + if (Visited.size() >= Max) return false; } return true; } -bool DAGCombiner::MergeConsecutiveStores( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) { +bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + + if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) + return false; + bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute( Attribute::NoImplicitFloat); @@ -11731,376 +12681,437 @@ bool DAGCombiner::MergeConsecutiveStores( if (MemVT.isVector() && IsLoadSrc) return false; - // Only look at ends of store sequences. - SDValue Chain = SDValue(St, 0); - if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE) - return false; - - // Save the LoadSDNodes that we find in the chain. - // We need to make sure that these nodes do not interfere with - // any of the store nodes. - SmallVector<LSBaseSDNode*, 8> AliasLoadNodes; - - getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes); + SmallVector<MemOpLink, 8> StoreNodes; + // Find potential store merge candidates by searching through chain sub-DAG + getStoreMergeCandidates(St, StoreNodes); // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; - // only do dependence check in AA case - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes)) - return false; - // Sort the memory operands according to their distance from the - // base pointer. As a secondary criteria: make sure stores coming - // later in the code come first in the list. This is important for - // the non-UseAA case, because we're merging stores into the FINAL - // store along a chain which potentially contains aliasing stores. - // Thus, if there are multiple stores to the same address, the last - // one can be considered for merging but not the others. + // base pointer. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { - return LHS.OffsetFromBase < RHS.OffsetFromBase || - (LHS.OffsetFromBase == RHS.OffsetFromBase && - LHS.SequenceNum < RHS.SequenceNum); - }); - - // Scan the memory operations on the chain and find the first non-consecutive - // store memory address. - unsigned LastConsecutiveStore = 0; - int64_t StartAddress = StoreNodes[0].OffsetFromBase; - for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) { - + return LHS.OffsetFromBase < RHS.OffsetFromBase; + }); + + // Store Merge attempts to merge the lowest stores. This generally + // works out as if successful, as the remaining stores are checked + // after the first collection of stores is merged. 
However, in the + // case that a non-mergeable store is found first, e.g., {p[-2], + // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent + // mergeable cases. To prevent this, we prune such stores from the + // front of StoreNodes here. + + bool RV = false; + while (StoreNodes.size() > 1) { + unsigned StartIdx = 0; + while ((StartIdx + 1 < StoreNodes.size()) && + StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != + StoreNodes[StartIdx + 1].OffsetFromBase) + ++StartIdx; + + // Bail if we don't have enough candidates to merge. + if (StartIdx + 1 >= StoreNodes.size()) + return RV; + + if (StartIdx) + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx); + + // Scan the memory operations on the chain and find the first + // non-consecutive store memory address. + unsigned NumConsecutiveStores = 1; + int64_t StartAddress = StoreNodes[0].OffsetFromBase; // Check that the addresses are consecutive starting from the second // element in the list of stores. - if (i > 0) { + for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { int64_t CurrAddress = StoreNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; + NumConsecutiveStores = i + 1; } - // Check if this store interferes with any of the loads that we found. - // If we find a load that alias with this store. Stop the sequence. - if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) { - return isAlias(Ldn, StoreNodes[i].MemNode); - })) - break; + if (NumConsecutiveStores < 2) { + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumConsecutiveStores); + continue; + } - // Mark this node as useful. - LastConsecutiveStore = i; - } + // Check that we can merge these candidates without causing a cycle + if (!checkMergeStoreCandidatesForDependencies(StoreNodes, + NumConsecutiveStores)) { + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumConsecutiveStores); + continue; + } - // The node with the lowest store address. - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); - LLVMContext &Context = *DAG.getContext(); - const DataLayout &DL = DAG.getDataLayout(); - - // Store the constants into memory as one consecutive store. - if (IsConstantSrc) { - unsigned LastLegalType = 0; - unsigned LastLegalVectorType = 0; - bool NonZero = false; - for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - SDValue StoredVal = St->getValue(); - - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { - NonZero |= !C->isNullValue(); - } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) { - NonZero |= !C->getConstantFPValue()->isNullValue(); - } else { - // Non-constant. - break; - } + // The node with the lowest store address. + LLVMContext &Context = *DAG.getContext(); + const DataLayout &DL = DAG.getDataLayout(); - // Find a legal type for the constant store. - unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); - bool IsFast; - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) { - LastLegalType = i+1; - // Or check whether a truncstore is legal. 
- } else if (TLI.getTypeAction(Context, StoreTy) == - TargetLowering::TypePromoteInteger) { - EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign, &IsFast) && + // Store the constants into memory as one consecutive store. + if (IsConstantSrc) { + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + unsigned LastLegalType = 1; + unsigned LastLegalVectorType = 1; + bool LastIntegerTrunc = false; + bool NonZero = false; + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); + SDValue StoredVal = ST->getValue(); + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { + NonZero |= !C->isNullValue(); + } else if (ConstantFPSDNode *C = + dyn_cast<ConstantFPSDNode>(StoredVal)) { + NonZero |= !C->getConstantFPValue()->isNullValue(); + } else { + // Non-constant. + break; + } + + // Find a legal type for the constant store. + unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; + EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); + bool IsFast = false; + if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) { + LastIntegerTrunc = false; LastLegalType = i + 1; + // Or check whether a truncstore is legal. + } else if (TLI.getTypeAction(Context, StoreTy) == + TargetLowering::TypePromoteInteger) { + EVT LegalizedStoredValueTy = + TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); + if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFast) && + IsFast) { + LastIntegerTrunc = true; + LastLegalType = i + 1; + } } - } - // We only use vectors if the constant is known to be zero or the target - // allows it and the function is not marked with the noimplicitfloat - // attribute. - if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1, - FirstStoreAS)) && - !NoVectors) { - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); - if (TLI.isTypeLegal(Ty) && - TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) - LastLegalVectorType = i + 1; + // We only use vectors if the constant is known to be zero or the target + // allows it and the function is not marked with the noimplicitfloat + // attribute. + if ((!NonZero || + TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && + !NoVectors) { + // Find a legal type for the vector store. + unsigned Elts = i + 1; + if (MemVT.isVector()) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); + if (TLI.isTypeLegal(Ty) && + TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && + IsFast) + LastLegalVectorType = i + 1; + } } - } - // Check if we found a legal integer type to store. 
- if (LastLegalType == 0 && LastLegalVectorType == 0) - return false; - - bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; - unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; - - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, - true, UseVector); - } + // Check if we found a legal integer type that creates a meaningful merge. + if (LastLegalType < 2 && LastLegalVectorType < 2) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + continue; + } - // When extracting multiple vector elements, try to store them - // in one vector store rather than a sequence of scalar stores. - if (IsExtractVecSrc) { - unsigned NumStoresToMerge = 0; - bool IsVec = MemVT.isVector(); - for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - unsigned StoreValOpcode = St->getValue().getOpcode(); - // This restriction could be loosened. - // Bail out if any stored values are not elements extracted from a vector. - // It should be possible to handle mixed sources, but load sources need - // more careful handling (see the block of code below that handles - // consecutive loads). - if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && - StoreValOpcode != ISD::EXTRACT_SUBVECTOR) - return false; + bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; + unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; - // Find a legal type for the vector store. - unsigned Elts = i + 1; - if (IsVec) { - // When merging vector stores, get the total number of elements. - Elts *= MemVT.getVectorNumElements(); + bool Merged = MergeStoresOfConstantsOrVecElts( + StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); + if (!Merged) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); + continue; } - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); - bool IsFast; - if (TLI.isTypeLegal(Ty) && - TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) - NumStoresToMerge = i + 1; + // Remove merged stores for next iteration. + RV = true; + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); + continue; } - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge, - false, true); - } - - // Below we handle the case of multiple consecutive stores that - // come from multiple consecutive loads. We merge them into a single - // wide load and a single wide store. - - // Look for load nodes which are used by the stored values. - SmallVector<MemOpLink, 8> LoadNodes; + // When extracting multiple vector elements, try to store them + // in one vector store rather than a sequence of scalar stores. + if (IsExtractVecSrc) { + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + unsigned NumStoresToMerge = 1; + bool IsVec = MemVT.isVector(); + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + unsigned StoreValOpcode = St->getValue().getOpcode(); + // This restriction could be loosened. + // Bail out if any stored values are not elements extracted from a + // vector. It should be possible to handle mixed sources, but load + // sources need more careful handling (see the block of code below that + // handles consecutive loads). 
+ if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && + StoreValOpcode != ISD::EXTRACT_SUBVECTOR) + return RV; - // Find acceptable loads. Loads need to have the same chain (token factor), - // must not be zext, volatile, indexed, and they must be consecutive. - BaseIndexOffset LdBasePtr; - for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); - if (!Ld) break; + // Find a legal type for the vector store. + unsigned Elts = i + 1; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); + bool IsFast; + if (TLI.isTypeLegal(Ty) && + TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && + IsFast) + NumStoresToMerge = i + 1; + } - // Loads must only have one use. - if (!Ld->hasNUsesOfValue(1, 0)) - break; + bool Merged = MergeStoresOfConstantsOrVecElts( + StoreNodes, MemVT, NumStoresToMerge, false, true, false); + if (!Merged) { + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumStoresToMerge); + continue; + } + // Remove merged stores for next iteration. + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumStoresToMerge); + RV = true; + continue; + } - // The memory operands must not be volatile. - if (Ld->isVolatile() || Ld->isIndexed()) - break; + // Below we handle the case of multiple consecutive stores that + // come from multiple consecutive loads. We merge them into a single + // wide load and a single wide store. - // We do not accept ext loads. - if (Ld->getExtensionType() != ISD::NON_EXTLOAD) - break; + // Look for load nodes which are used by the stored values. + SmallVector<MemOpLink, 8> LoadNodes; - // The stored memory type must be the same. - if (Ld->getMemoryVT() != MemVT) - break; + // Find acceptable loads. Loads need to have the same chain (token factor), + // must not be zext, volatile, indexed, and they must be consecutive. + BaseIndexOffset LdBasePtr; + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); + if (!Ld) + break; - BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); - // If this is not the first ptr that we check. - if (LdBasePtr.Base.getNode()) { - // The base ptr must be the same. - if (!LdPtr.equalBaseIndex(LdBasePtr)) + // Loads must only have one use. + if (!Ld->hasNUsesOfValue(1, 0)) break; - } else { - // Check that all other base pointers are the same as this one. - LdBasePtr = LdPtr; - } - // We found a potential memory operand to merge. - LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0)); - } + // The memory operands must not be volatile. + if (Ld->isVolatile() || Ld->isIndexed()) + break; - if (LoadNodes.size() < 2) - return false; + // We do not accept ext loads. + if (Ld->getExtensionType() != ISD::NON_EXTLOAD) + break; - // If we have load/store pair instructions and we only have two values, - // don't bother. - unsigned RequiredAlignment; - if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && - St->getAlignment() >= RequiredAlignment) - return false; + // The stored memory type must be the same. 
+ if (Ld->getMemoryVT() != MemVT) + break; - LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); - unsigned FirstLoadAS = FirstLoad->getAddressSpace(); - unsigned FirstLoadAlign = FirstLoad->getAlignment(); - - // Scan the memory operations on the chain and find the first non-consecutive - // load memory address. These variables hold the index in the store node - // array. - unsigned LastConsecutiveLoad = 0; - // This variable refers to the size and not index in the array. - unsigned LastLegalVectorType = 0; - unsigned LastLegalIntegerType = 0; - StartAddress = LoadNodes[0].OffsetFromBase; - SDValue FirstChain = FirstLoad->getChain(); - for (unsigned i = 1; i < LoadNodes.size(); ++i) { - // All loads must share the same chain. - if (LoadNodes[i].MemNode->getChain() != FirstChain) - break; + BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); + // If this is not the first ptr that we check. + int64_t LdOffset = 0; + if (LdBasePtr.getBase().getNode()) { + // The base ptr must be the same. + if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) + break; + } else { + // Check that all other base pointers are the same as this one. + LdBasePtr = LdPtr; + } - int64_t CurrAddress = LoadNodes[i].OffsetFromBase; - if (CurrAddress - StartAddress != (ElementSizeBytes * i)) - break; - LastConsecutiveLoad = i; - // Find a legal type for the vector store. - EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1); - bool IsFastSt, IsFastLd; - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFastSt) && IsFastSt && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, - FirstLoadAlign, &IsFastLd) && IsFastLd) { - LastLegalVectorType = i + 1; - } - - // Find a legal type for the integer store. - unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - StoreTy = EVT::getIntegerVT(Context, SizeInBits); - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFastSt) && IsFastSt && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, - FirstLoadAlign, &IsFastLd) && IsFastLd) - LastLegalIntegerType = i + 1; - // Or check whether a truncstore and extload is legal. - else if (TLI.getTypeAction(Context, StoreTy) == - TargetLowering::TypePromoteInteger) { - EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(Context, StoreTy); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign, &IsFastSt) && - IsFastSt && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstLoadAS, FirstLoadAlign, &IsFastLd) && - IsFastLd) - LastLegalIntegerType = i+1; + // We found a potential memory operand to merge. + LoadNodes.push_back(MemOpLink(Ld, LdOffset)); } - } - - // Only use vector types if the vector type is larger than the integer type. - // If they are the same, use integers. - bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; - unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType); - // We add +1 here because the LastXXX variables refer to location while - // the NumElem refers to array/index size. 
- unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1; - NumElem = std::min(LastLegalType, NumElem); - - if (NumElem < 2) - return false; - - // Collect the chains from all merged stores. - SmallVector<SDValue, 8> MergeStoreChains; - MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); + if (LoadNodes.size() < 2) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + continue; + } - // The latest Node in the DAG. - unsigned LatestNodeUsed = 0; - for (unsigned i=1; i<NumElem; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // latest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) - LatestNodeUsed = i; + // If we have load/store pair instructions and we only have two values, + // don't bother merging. + unsigned RequiredAlignment; + if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && + StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); + continue; + } + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); + unsigned FirstLoadAS = FirstLoad->getAddressSpace(); + unsigned FirstLoadAlign = FirstLoad->getAlignment(); + + // Scan the memory operations on the chain and find the first + // non-consecutive load memory address. These variables hold the index in + // the store node array. + unsigned LastConsecutiveLoad = 1; + // This variable refers to the size and not index in the array. + unsigned LastLegalVectorType = 1; + unsigned LastLegalIntegerType = 1; + bool isDereferenceable = true; + bool DoIntegerTruncate = false; + StartAddress = LoadNodes[0].OffsetFromBase; + SDValue FirstChain = FirstLoad->getChain(); + for (unsigned i = 1; i < LoadNodes.size(); ++i) { + // All loads must share the same chain. + if (LoadNodes[i].MemNode->getChain() != FirstChain) + break; - MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); - } + int64_t CurrAddress = LoadNodes[i].OffsetFromBase; + if (CurrAddress - StartAddress != (ElementSizeBytes * i)) + break; + LastConsecutiveLoad = i; - LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; + if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) + isDereferenceable = false; - // Find if it is better to use vectors or integers to load and store - // to memory. - EVT JointMemOpVT; - if (UseVectorTy) { - JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); - } else { - unsigned SizeInBits = NumElem * ElementSizeBytes * 8; - JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); - } + // Find a legal type for the vector store. 
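
The scans above keep only candidates whose offsets from a shared base pointer form a gap-free run (the CurrAddress - StartAddress == ElementSizeBytes * i check). A minimal standalone sketch of that consecutiveness test follows; MemOp and countConsecutive are hypothetical stand-ins, not LLVM types:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the (base, offset) decomposition: every candidate below is
// assumed to share the same base pointer already.
struct MemOp { int64_t OffsetFromBase; };

// How many ops, starting at index 0, form a gap-free run of
// ElementSizeBytes-sized accesses.
static unsigned countConsecutive(const std::vector<MemOp> &Ops,
                                 int64_t ElementSizeBytes) {
  if (Ops.empty())
    return 0;
  int64_t Start = Ops[0].OffsetFromBase;
  unsigned N = 1;
  for (unsigned i = 1; i < Ops.size(); ++i) {
    if (Ops[i].OffsetFromBase - Start != ElementSizeBytes * int64_t(i))
      break;
    N = i + 1;
  }
  return N;
}

int main() {
  std::vector<MemOp> Ops = {{0}, {4}, {8}, {16}}; // gap before offset 16
  std::printf("%u consecutive ops\n", countConsecutive(Ops, 4)); // prints 3
  return 0;
}
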
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); + bool IsFastSt, IsFastLd; + if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && + IsFastLd) { + LastLegalVectorType = i + 1; + } - SDLoc LoadDL(LoadNodes[0].MemNode); - SDLoc StoreDL(StoreNodes[0].MemNode); + // Find a legal type for the integer store. + unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; + StoreTy = EVT::getIntegerVT(Context, SizeInBits); + if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && + IsFastLd) { + LastLegalIntegerType = i + 1; + DoIntegerTruncate = false; + // Or check whether a truncstore and extload is legal. + } else if (TLI.getTypeAction(Context, StoreTy) == + TargetLowering::TypePromoteInteger) { + EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy); + if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) && + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, + StoreTy) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, + StoreTy) && + TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && + IsFastLd) { + LastLegalIntegerType = i + 1; + DoIntegerTruncate = true; + } + } + } - // The merged loads are required to have the same incoming chain, so - // using the first's chain is acceptable. - SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), - FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoadAlign); + // Only use vector types if the vector type is larger than the integer type. + // If they are the same, use integers. + bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; + unsigned LastLegalType = + std::max(LastLegalVectorType, LastLegalIntegerType); - SDValue NewStoreChain = - DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + // We add +1 here because the LastXXX variables refer to location while + // the NumElem refers to array/index size. + unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); + NumElem = std::min(LastLegalType, NumElem); - SDValue NewStore = - DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign); + if (NumElem < 2) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + continue; + } - // Transfer chain users from old loads to the new load. - for (unsigned i = 0; i < NumElem; ++i) { - LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), - SDValue(NewLoad.getNode(), 1)); - } + // Find if it is better to use vectors or integers to load and store + // to memory. 
+ EVT JointMemOpVT; + if (UseVectorTy) { + JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); + } else { + unsigned SizeInBits = NumElem * ElementSizeBytes * 8; + JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); + } + + SDLoc LoadDL(LoadNodes[0].MemNode); + SDLoc StoreDL(StoreNodes[0].MemNode); + + // The merged loads are required to have the same incoming chain, so + // using the first's chain is acceptable. + + SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); + AddToWorklist(NewStoreChain.getNode()); + + MachineMemOperand::Flags MMOFlags = isDereferenceable ? + MachineMemOperand::MODereferenceable: + MachineMemOperand::MONone; + + SDValue NewLoad, NewStore; + if (UseVectorTy || !DoIntegerTruncate) { + NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), + FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), FirstLoadAlign, + MMOFlags); + NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad, + FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), FirstStoreAlign); + } else { // This must be the truncstore/extload case + EVT ExtendedTy = + TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); + NewLoad = + DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), + FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), + JointMemOpVT, FirstLoadAlign, MMOFlags); + NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, + FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), JointMemOpVT, + FirstInChain->getAlignment(), + FirstInChain->getMemOperand()->getFlags()); + } + + // Transfer chain users from old loads to the new load. + for (unsigned i = 0; i < NumElem; ++i) { + LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + } - if (UseAA) { // Replace the all stores with the new store. for (unsigned i = 0; i < NumElem; ++i) CombineTo(StoreNodes[i].MemNode, NewStore); - } else { - // Replace the last store with the new store. - CombineTo(LatestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumElem; ++i) { - // Remove all Store nodes. - if (StoreNodes[i].MemNode == LatestOp) - continue; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); - } + RV = true; + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); + continue; } - - StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end()); - return true; + return RV; } SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { @@ -12256,19 +13267,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (SDValue NewST = TransformFPLoadStorePair(N)) return NewST; - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); -#ifndef NDEBUG - if (CombinerAAOnlyFunc.getNumOccurrences() && - CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) - UseAA = false; -#endif - if (UseAA && ST->isUnindexed()) { - // FIXME: We should do this even without AA enabled. AA will just allow - // FindBetterChain to work in more situations. The problem with this is that - // any combine that expects memory operations to be on consecutive chains - // first needs to be updated to look for users of the same chain. - + if (ST->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes, on this store and any // adjacent stores. 
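
The block above emits the single wide load and wide store (or an extload/truncstore pair when DoIntegerTruncate is set) that replace the matched run of narrow loads and stores. Ignoring legality and alignment questions, the rewrite amounts to the difference between these two loops, shown here as a plain C++ sketch:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  unsigned char Src[4] = {1, 2, 3, 4};
  unsigned char Dst[4] = {};

  // Before: four narrow load/store pairs.
  for (int i = 0; i < 4; ++i)
    Dst[i] = Src[i];

  // After: one wide load feeding one wide store.
  uint32_t Wide;
  std::memcpy(&Wide, Src, sizeof(Wide)); // single 32-bit load
  std::memcpy(Dst, &Wide, sizeof(Wide)); // single 32-bit store

  std::printf("%d %d %d %d\n", Dst[0], Dst[1], Dst[2], Dst[3]);
  return 0;
}
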
if (findBetterNeighborChains(ST)) { @@ -12279,10 +13278,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { Chain = ST->getChain(); } - // Try transforming N to an indexed store. - if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) - return SDValue(N, 0); - // FIXME: is there such a thing as a truncating indexed store? if (ST->isTruncatingStore() && ST->isUnindexed() && Value.getValueType().isInteger()) { @@ -12302,8 +13297,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (SimplifyDemandedBits( Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), - ST->getMemoryVT().getScalarSizeInBits()))) + ST->getMemoryVT().getScalarSizeInBits()))) { + // Re-visit the store if anything changed and the store hasn't been merged + // with another node (N is deleted) SimplifyDemandedBits will add Value's + // node back to the worklist if necessary, but we also need to re-visit + // the Store node itself. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); return SDValue(N, 0); + } } // If this is a load followed by a store to the same location, then the store @@ -12319,14 +13321,28 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } - // If this is a store followed by a store with the same value to the same - // location, then the store is dead/noop. if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { - if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() && - ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() && - ST1->isUnindexed() && !ST1->isVolatile()) { - // The store is dead, remove it. - return Chain; + if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && + !ST1->isVolatile() && ST1->getBasePtr() == Ptr && + ST->getMemoryVT() == ST1->getMemoryVT()) { + // If this is a store followed by a store with the same value to the same + // location, then the store is dead/noop. + if (ST1->getValue() == Value) { + // The store is dead, remove it. + return Chain; + } + + // If this is a store who's preceeding store to the same location + // and no one other node is chained to that store we can effectively + // drop the store. Do not remove stores to undef as they may be used as + // data sinks. + if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && + !ST1->getBasePtr().isUndef()) { + // ST1 is fully overwritten and can be elided. Combine with it's chain + // value. + CombineTo(ST1, ST1->getChain()); + return SDValue(); + } } } @@ -12342,29 +13358,31 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // Only perform this optimization before the types are legal, because we // don't want to perform this optimization on every DAGCombine invocation. - if (!LegalTypes) { + if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG + : !LegalTypes) { for (;;) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so // or until we merge the last store on the chain. - SmallVector<MemOpLink, 8> StoreNodes; - bool Changed = MergeConsecutiveStores(ST, StoreNodes); + bool Changed = MergeConsecutiveStores(ST); if (!Changed) break; - - if (any_of(StoreNodes, - [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) { - // ST has been merged and no longer exists. + // Return N as merge only uses CombineTo and no worklist clean + // up is necessary. + if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N)) return SDValue(N, 0); - } } } + // Try transforming N to an indexed store. 
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' // // Make sure to do this only after attempting to merge stores in order to // avoid changing the types of some subset of stores due to visit order, // preventing their merging. - if (isa<ConstantFPSDNode>(Value)) { + if (isa<ConstantFPSDNode>(ST->getValue())) { if (SDValue NewSt = replaceStoreOfFPConstant(ST)) return NewSt; } @@ -12493,10 +13511,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { EVT VT = InVec.getValueType(); - // If we can't generate a legal BUILD_VECTOR, exit - if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) - return SDValue(); - // Check that we know which element is being inserted if (!isa<ConstantSDNode>(EltNo)) return SDValue(); @@ -12511,8 +13525,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // do this only if indices are both constants and Idx1 < Idx0. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() && isa<ConstantSDNode>(InVec.getOperand(2))) { - unsigned OtherElt = - cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue(); + unsigned OtherElt = InVec.getConstantOperandVal(2); if (Elt < OtherElt) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, @@ -12523,6 +13536,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { } } + // If we can't generate a legal BUILD_VECTOR, exit + if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) + return SDValue(); + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. @@ -12544,11 +13561,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // All the operands of BUILD_VECTOR must have the same type; // we enforce that here. EVT OpVT = Ops[0].getValueType(); - if (InVal.getValueType() != OpVT) - InVal = OpVT.bitsGT(InVal.getValueType()) ? - DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal); - Ops[Elt] = InVal; + Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; } // Return the new vector @@ -12568,6 +13581,11 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) return SDValue(); + ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? + ISD::NON_EXTLOAD : ISD::EXTLOAD; + if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) + return SDValue(); + Align = NewAlign; SDValue NewPtr = OriginalLoad->getBasePtr(); @@ -12639,6 +13657,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); + if (InVec.isUndef()) + return DAG.getUNDEF(NVT); + if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the @@ -13022,7 +14043,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { return DAG.getNode(Opcode, DL, VT, BV); } -SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N, +SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef<int> VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx) { @@ -13088,6 +14109,11 @@ SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N, // when we start sorting the vectors by type. 
return SDValue(); } + } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && + InVT1.getSizeInBits() == VT.getSizeInBits()) { + SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2)); + ConcatOps[0] = VecIn2; + VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } else { // TODO: Support cases where the length mismatch isn't exactly by a // factor of 2. @@ -13293,6 +14319,73 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { return Shuffles[0]; } +// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT +// operations which can be matched to a truncate. +SDValue DAGCombiner::reduceBuildVecToTrunc(SDNode *N) { + // TODO: Add support for big-endian. + if (DAG.getDataLayout().isBigEndian()) + return SDValue(); + if (N->getNumOperands() < 2) + return SDValue(); + SDLoc DL(N); + EVT VT = N->getValueType(0); + unsigned NumElems = N->getNumOperands(); + + if (!isTypeLegal(VT)) + return SDValue(); + + // If the input is something other than an EXTRACT_VECTOR_ELT with a constant + // index, bail out. + // TODO: Allow undef elements in some cases? + if (any_of(N->ops(), [VT](SDValue Op) { + return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op.getOperand(1)) || + Op.getValueType() != VT.getVectorElementType(); + })) + return SDValue(); + + // Helper for obtaining an EXTRACT_VECTOR_ELT's constant index + auto GetExtractIdx = [](SDValue Extract) { + return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue(); + }; + + // The first BUILD_VECTOR operand must be an an extract from index zero + // (assuming no undef and little-endian). + if (GetExtractIdx(N->getOperand(0)) != 0) + return SDValue(); + + // Compute the stride from the first index. + int Stride = GetExtractIdx(N->getOperand(1)); + SDValue ExtractedFromVec = N->getOperand(0).getOperand(0); + + // Proceed only if the stride and the types can be matched to a truncate. + if ((Stride == 1 || !isPowerOf2_32(Stride)) || + (ExtractedFromVec.getValueType().getVectorNumElements() != + Stride * NumElems) || + (VT.getScalarSizeInBits() * Stride > 64)) + return SDValue(); + + // Check remaining operands are consistent with the computed stride. + for (unsigned i = 1; i != NumElems; ++i) { + SDValue Op = N->getOperand(i); + + if ((Op.getOperand(0) != ExtractedFromVec) || + (GetExtractIdx(Op) != Stride * i)) + return SDValue(); + } + + // All checks were ok, construct the truncate. + LLVMContext &Ctx = *DAG.getContext(); + EVT NewVT = VT.getVectorVT( + Ctx, EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits() * Stride), NumElems); + EVT TruncVT = + VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; + + SDValue Res = DAG.getBitcast(NewVT, ExtractedFromVec); + Res = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, Res); + return DAG.getBitcast(VT, Res); +} + SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { EVT VT = N->getValueType(0); @@ -13300,12 +14393,45 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); + // Check if we can express BUILD VECTOR via subvector extract. 
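
reduceBuildVecToTrunc above accepts a BUILD_VECTOR whose operands are extracts at indices 0, S, 2S, ... from one wider source vector and rewrites it as a bitcast plus truncate. A small standalone sketch of the index-pattern check it performs (matchTruncateStride is a made-up helper name):

#include <cstdio>
#include <vector>

// Given the extract indices feeding a build_vector, return the stride S if
// they are exactly 0, S, 2S, ... with S a power of two greater than 1,
// otherwise return 0.
static unsigned matchTruncateStride(const std::vector<unsigned> &Idx) {
  if (Idx.size() < 2 || Idx[0] != 0)
    return 0;
  unsigned S = Idx[1];
  if (S < 2 || (S & (S - 1)) != 0) // stride must be a power of two > 1
    return 0;
  for (unsigned i = 1; i < Idx.size(); ++i)
    if (Idx[i] != S * i)
      return 0;
  return S;
}

int main() {
  // e.g. a v4i32 built from elements 0, 2, 4, 6 of a v8i32: stride 2, so the
  // whole pattern is a truncate of that v8i32 viewed as a v4i64.
  std::vector<unsigned> Idx = {0, 2, 4, 6};
  std::printf("stride = %u\n", matchTruncateStride(Idx)); // prints 2
  return 0;
}
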
+ if (!LegalTypes && (N->getNumOperands() > 1)) { + SDValue Op0 = N->getOperand(0); + auto checkElem = [&](SDValue Op) -> uint64_t { + if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && + (Op0.getOperand(0) == Op.getOperand(0))) + if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return CNode->getZExtValue(); + return -1; + }; + + int Offset = checkElem(Op0); + for (unsigned i = 0; i < N->getNumOperands(); ++i) { + if (Offset + i != checkElem(N->getOperand(i))) { + Offset = -1; + break; + } + } + + if ((Offset == 0) && + (Op0.getOperand(0).getValueType() == N->getValueType(0))) + return Op0.getOperand(0); + if ((Offset != -1) && + ((Offset % N->getValueType(0).getVectorNumElements()) == + 0)) // IDX must be multiple of output size. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), + Op0.getOperand(0), Op0.getOperand(1)); + } + if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) return V; if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N)) return V; + if (TLI.isDesirableToCombineBuildVectorToTruncate()) + if (SDValue V = reduceBuildVecToTrunc(N)) + return V; + if (SDValue V = reduceBuildVecToShuffle(N)) return V; @@ -13419,7 +14545,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDValue(); - int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + int ExtIdx = Op.getConstantOperandVal(1); // Ensure that we are extracting a subvector from a vector the same // size as the result. @@ -13491,8 +14617,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) return SDValue(); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, - VT.getSizeInBits() / SclTy.getSizeInBits()); + unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits(); + if (VNTNumElms < 2) + return SDValue(); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms); if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType())) return SDValue(); @@ -13607,19 +14736,153 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return SDValue(); } +/// If we are extracting a subvector produced by a wide binary operator with at +/// at least one operand that was the result of a vector concatenation, then try +/// to use the narrow vector operands directly to avoid the concatenation and +/// extraction. +static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share + // some of these bailouts with other transforms. + + // The extract index must be a constant, so we can map it to a concat operand. + auto *ExtractIndex = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); + if (!ExtractIndex) + return SDValue(); + + // Only handle the case where we are doubling and then halving. A larger ratio + // may require more than two narrow binops to replace the wide binop. + EVT VT = Extract->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + assert((ExtractIndex->getZExtValue() % NumElems) == 0 && + "Extract index is not a multiple of the vector length."); + if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2) + return SDValue(); + + // We are looking for an optionally bitcasted wide vector binary operator + // feeding an extract subvector. 
+ SDValue BinOp = Extract->getOperand(0); + if (BinOp.getOpcode() == ISD::BITCAST) + BinOp = BinOp.getOperand(0); + + // TODO: The motivating case for this transform is an x86 AVX1 target. That + // target has temptingly almost legal versions of bitwise logic ops in 256-bit + // flavors, but no other 256-bit integer support. This could be extended to + // handle any binop, but that may require fixing/adding other folds to avoid + // codegen regressions. + unsigned BOpcode = BinOp.getOpcode(); + if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) + return SDValue(); + + // The binop must be a vector type, so we can chop it in half. + EVT WideBVT = BinOp.getValueType(); + if (!WideBVT.isVector()) + return SDValue(); + + // Bail out if the target does not support a narrower version of the binop. + EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), + WideBVT.getVectorNumElements() / 2); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) + return SDValue(); + + // Peek through bitcasts of the binary operator operands if needed. + SDValue LHS = BinOp.getOperand(0); + if (LHS.getOpcode() == ISD::BITCAST) + LHS = LHS.getOperand(0); + + SDValue RHS = BinOp.getOperand(1); + if (RHS.getOpcode() == ISD::BITCAST) + RHS = RHS.getOperand(0); + + // We need at least one concatenation operation of a binop operand to make + // this transform worthwhile. The concat must double the input vector sizes. + // TODO: Should we also handle INSERT_SUBVECTOR patterns? + bool ConcatL = + LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2; + bool ConcatR = + RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2; + if (!ConcatL && !ConcatR) + return SDValue(); + + // If one of the binop operands was not the result of a concat, we must + // extract a half-sized operand for our new narrow binop. We can't just reuse + // the original extract index operand because we may have bitcasted. + unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems; + unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); + EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); + SDLoc DL(Extract); + + // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN + // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N) + // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN + SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(0), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(1), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); + return DAG.getBitcast(VT, NarrowBinOp); +} + +/// If we are extracting a subvector from a wide vector load, convert to a +/// narrow load to eliminate the extraction: +/// (extract_subvector (load wide vector)) --> (load narrow vector) +static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Add support for big-endian. The offset calculation must be adjusted. + if (DAG.getDataLayout().isBigEndian()) + return SDValue(); + + // TODO: The one-use check is overly conservative. Check the cost of the + // extract instead or remove that condition entirely. 
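
narrowExtractedVectorBinOp above rewrites extract_subvector (binop (concat X1, X2), (concat Y1, Y2)), N into binop XN, YN, so the wide operation is never formed. A toy model with fixed-size arrays standing in for vectors, showing that both forms compute the same half:

#include <array>
#include <cstdio>

using Half = std::array<unsigned, 2>; // "narrow" 2-element vector
using Wide = std::array<unsigned, 4>; // "wide" 4-element vector

static Wide concat(Half A, Half B) { return {A[0], A[1], B[0], B[1]}; }

static Half extractHalf(const Wide &V, unsigned HalfIdx) {
  return {V[2 * HalfIdx], V[2 * HalfIdx + 1]};
}

int main() {
  Half X1{1, 2}, X2{3, 4}, Y1{8, 8}, Y2{1, 1};

  // Wide form: AND the two concatenated vectors, then extract half 1.
  Wide L = concat(X1, X2), R = concat(Y1, Y2), W{};
  for (unsigned i = 0; i < 4; ++i)
    W[i] = L[i] & R[i];
  Half WideResult = extractHalf(W, 1);

  // Narrow form: operate directly on the matching concat operands.
  Half NarrowResult{X2[0] & Y2[0], X2[1] & Y2[1]};

  std::printf("wide:   %u %u\n", WideResult[0], WideResult[1]);
  std::printf("narrow: %u %u\n", NarrowResult[0], NarrowResult[1]);
  return 0;
}
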
+ auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0)); + auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); + if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() || + !ExtIdx) + return SDValue(); + + // The narrow load will be offset from the base address of the old load if + // we are extracting from something besides index 0 (little-endian). + EVT VT = Extract->getValueType(0); + SDLoc DL(Extract); + SDValue BaseAddr = Ld->getOperand(1); + unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); + + // TODO: Use "BaseIndexOffset" to make this more effective. + SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, + VT.getStoreSize()); + SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); + DAG.makeEquivalentMemoryOrdering(Ld, NewLd); + return NewLd; +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); - if (V->getOpcode() == ISD::CONCAT_VECTORS) { - // Combine: - // (extract_subvec (concat V1, V2, ...), i) - // Into: - // Vi if possible - // Only operand 0 is checked as 'concat' assumes all inputs of the same - // type. - if (V->getOperand(0).getValueType() != NVT) - return SDValue(); + // Extract from UNDEF is UNDEF. + if (V.isUndef()) + return DAG.getUNDEF(NVT); + + if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) + if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) + return NarrowLoad; + + // Combine: + // (extract_subvec (concat V1, V2, ...), i) + // Into: + // Vi if possible + // Only operand 0 is checked as 'concat' assumes all inputs of the same + // type. + if (V->getOpcode() == ISD::CONCAT_VECTORS && + isa<ConstantSDNode>(N->getOperand(1)) && + V->getOperand(0).getValueType() == NVT) { unsigned Idx = N->getConstantOperandVal(1); unsigned NumElems = NVT.getVectorNumElements(); assert((Idx % NumElems) == 0 && @@ -13633,19 +14896,16 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { if (V->getOpcode() == ISD::INSERT_SUBVECTOR) { // Handle only simple case where vector being inserted and vector - // being extracted are of same type, and are half size of larger vectors. - EVT BigVT = V->getOperand(0).getValueType(); + // being extracted are of same size. EVT SmallVT = V->getOperand(1).getValueType(); - if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits()) + if (!NVT.bitsEq(SmallVT)) return SDValue(); - // Only handle cases where both indexes are constants with the same type. + // Only handle cases where both indexes are constants. ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1)); ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2)); - if (InsIdx && ExtIdx && - InsIdx->getValueType(0).getSizeInBits() <= 64 && - ExtIdx->getValueType(0).getSizeInBits() <= 64) { + if (InsIdx && ExtIdx) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) // Into: @@ -13661,6 +14921,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { } } + if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) + return NarrowBOp; + return SDValue(); } @@ -13892,6 +15155,167 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, return DAG.getBuildVector(VT, SDLoc(SVN), Ops); } +// Match shuffles that can be converted to any_vector_extend_in_reg. +// This is often generated during legalization. +// e.g. 
v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) +// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. +static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + EVT VT = SVN->getValueType(0); + bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + + // TODO Add support for big-endian when we have a test case. + if (!VT.isInteger() || IsBigEndian) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + ArrayRef<int> Mask = SVN->getMask(); + SDValue N0 = SVN->getOperand(0); + + // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) + auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] < 0) + continue; + if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) + continue; + return false; + } + return true; + }; + + // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for + // power-of-2 extensions as they are the most likely. + for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { + if (!isAnyExtend(Scale)) + continue; + + EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); + if (!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) + return DAG.getBitcast(VT, + DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT)); + } + + return SDValue(); +} + +// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of +// each source element of a large type into the lowest elements of a smaller +// destination type. This is often generated during legalization. +// If the source node itself was a '*_extend_vector_inreg' node then we should +// then be able to remove it. +static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG) { + EVT VT = SVN->getValueType(0); + bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + + // TODO Add support for big-endian when we have a test case. + if (!VT.isInteger() || IsBigEndian) + return SDValue(); + + SDValue N0 = SVN->getOperand(0); + while (N0.getOpcode() == ISD::BITCAST) + N0 = N0.getOperand(0); + + unsigned Opcode = N0.getOpcode(); + if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && + Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && + Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + ArrayRef<int> Mask = SVN->getMask(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); + unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); + + if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) + return SDValue(); + unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; + + // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> + // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> + // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> + auto isTruncate = [&Mask, &NumElts](unsigned Scale) { + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] < 0) + continue; + if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale)) + continue; + return false; + } + return true; + }; + + // At the moment we just handle the case where we've truncated back to the + // same size as before the extension. 
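
combineShuffleToVectorExtend and combineTruncationShuffle above key off two mask shapes: <0,u,1,u,...> for an in-register any-extend, and <0,S,2S,...,u,...> for an in-register truncate. A standalone sketch of the two mask predicates, using -1 for undef as ShuffleVectorSDNode masks do:

#include <cstdio>
#include <vector>

// <0,u,1,u> style: every defined lane i at a multiple of Scale must read
// element i / Scale; any other defined lane disqualifies the mask.
static bool isAnyExtendMask(const std::vector<int> &Mask, unsigned Scale) {
  for (unsigned i = 0; i < Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue;
    if (i % Scale == 0 && Mask[i] == int(i / Scale))
      continue;
    return false;
  }
  return true;
}

// <0,S,2S,...,u,u> style: a defined lane i must read element i * Scale, and
// only lanes with i * Scale still in range may be defined.
static bool isTruncateMask(const std::vector<int> &Mask, unsigned Scale) {
  unsigned N = Mask.size();
  for (unsigned i = 0; i < N; ++i) {
    if (Mask[i] < 0)
      continue;
    if (i * Scale < N && Mask[i] == int(i * Scale))
      continue;
    return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", isAnyExtendMask({0, -1, 1, -1}, 2)); // 1
  std::printf("%d\n", isTruncateMask({0, 2, -1, -1}, 2));  // 1
  std::printf("%d\n", isTruncateMask({0, 3, -1, -1}, 2));  // 0
  return 0;
}
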
+ // TODO: handle more extension/truncation cases as cases arise. + if (EltSizeInBits != ExtSrcSizeInBits) + return SDValue(); + + // We can remove *extend_vector_inreg only if the truncation happens at + // the same scale as the extension. + if (isTruncate(ExtScale)) + return DAG.getBitcast(VT, N00); + + return SDValue(); +} + +// Combine shuffles of splat-shuffles of the form: +// shuffle (shuffle V, undef, splat-mask), undef, M +// If splat-mask contains undef elements, we need to be careful about +// introducing undef's in the folded mask which are not the result of composing +// the masks of the shuffles. +static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask, + ShuffleVectorSDNode *Splat, + SelectionDAG &DAG) { + ArrayRef<int> SplatMask = Splat->getMask(); + assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); + + // Prefer simplifying to the splat-shuffle, if possible. This is legal if + // every undef mask element in the splat-shuffle has a corresponding undef + // element in the user-shuffle's mask or if the composition of mask elements + // would result in undef. + // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): + // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] + // In this case it is not legal to simplify to the splat-shuffle because we + // may be exposing the users of the shuffle an undef element at index 1 + // which was not there before the combine. + // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] + // In this case the composition of masks yields SplatMask, so it's ok to + // simplify to the splat-shuffle. + // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] + // In this case the composed mask includes all undef elements of SplatMask + // and in addition sets element zero to undef. It is safe to simplify to + // the splat-shuffle. + auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask, + ArrayRef<int> SplatMask) { + for (unsigned i = 0, e = UserMask.size(); i != e; ++i) + if (UserMask[i] != -1 && SplatMask[i] == -1 && + SplatMask[UserMask[i]] != -1) + return false; + return true; + }; + if (CanSimplifyToExistingSplat(UserMask, SplatMask)) + return SDValue(Splat, 0); + + // Create a new shuffle with a mask that is composed of the two shuffles' + // masks. + SmallVector<int, 32> NewMask; + for (int Idx : UserMask) + NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); + + return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), + Splat->getOperand(0), Splat->getOperand(1), + NewMask); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -13938,6 +15362,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } + // A shuffle of a single vector that is a splat can always be folded. + if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0)) + if (N1->isUndef() && N0Shuf->isSplat()) + return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); + // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { @@ -13996,6 +15425,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG)) return S; + // Match shuffles that can be converted to any_vector_extend_in_reg. 
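
combineShuffleOfSplat above folds shuffle(shuffle(V, undef, SplatMask), undef, UserMask) by composing the two masks, and only reuses the existing splat-shuffle when doing so would not surface new undef lanes. The composition step itself, as a standalone sketch:

#include <cstdio>
#include <vector>

// Compose (shuffle (shuffle V, undef, SplatMask), undef, UserMask) into one
// mask over V; -1 means undef, as in ShuffleVectorSDNode.
static std::vector<int> composeMasks(const std::vector<int> &UserMask,
                                     const std::vector<int> &SplatMask) {
  std::vector<int> NewMask;
  for (int Idx : UserMask)
    NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
  return NewMask;
}

int main() {
  // UserMask=[0,2,u,u] with SplatMask=[2,u,2,u] composes to [2,2,u,u], which
  // differs from the splat mask, so a fresh shuffle would be built.
  for (int M : composeMasks({0, 2, -1, -1}, {2, -1, 2, -1}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
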
+ if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) + return V; + + // Combine "truncate_vector_in_reg" style shuffles. + if (SDValue V = combineTruncationShuffle(SVN, DAG)) + return V; + if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.isUndef() || @@ -14253,6 +15690,16 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); + // If inserting an UNDEF, just return the original vector. + if (N1.isUndef()) + return N0; + + // If this is an insert of an extracted vector into an undef vector, we can + // just use the input to the extract. + if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) + return N1.getOperand(0); + // Combine INSERT_SUBVECTORs where we are inserting to the same index. // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) @@ -14262,26 +15709,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); - if (N0.getValueType() != N1.getValueType()) + if (!isa<ConstantSDNode>(N2)) return SDValue(); + unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue(); + + // Canonicalize insert_subvector dag nodes. + // Example: + // (insert_subvector (insert_subvector A, Idx0), Idx1) + // -> (insert_subvector (insert_subvector A, Idx1), Idx0) + if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && + N1.getValueType() == N0.getOperand(1).getValueType() && + isa<ConstantSDNode>(N0.getOperand(2))) { + unsigned OtherIdx = N0.getConstantOperandVal(2); + if (InsIdx < OtherIdx) { + // Swap nodes. + SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, + N0.getOperand(0), N1, N2); + AddToWorklist(NewOp.getNode()); + return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), + VT, NewOp, N0.getOperand(1), N0.getOperand(2)); + } + } + // If the input vector is a concatenation, and the insert replaces - // one of the halves, we can optimize into a single concat_vectors. - if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 && - N2.getOpcode() == ISD::Constant) { - APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue(); - - // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) -> - // (concat_vectors Z, Y) - if (InsIdx == 0) - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1, - N0.getOperand(1)); + // one of the pieces, we can optimize into a single concat_vectors. + if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && + N0.getOperand(0).getValueType() == N1.getValueType()) { + unsigned Factor = N1.getValueType().getVectorNumElements(); + + SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); + Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1; - // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) -> - // (concat_vectors X, Z) - if (InsIdx == VT.getVectorNumElements() / 2) - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0), - N1); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } return SDValue(); @@ -14366,9 +15826,9 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { // Extract the sub element from the constant bit mask. 
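
The visitINSERT_SUBVECTOR change above handles an insert into a multi-piece concat_vectors by overwriting the piece at index InsIdx / Factor, rather than special-casing only the two-operand lower/upper-half form. A toy illustration of that indexing (the values are arbitrary):

#include <array>
#include <cstdio>

int main() {
  // A v8i32 modeled as a concat of four v2i32 pieces.
  std::array<std::array<int, 2>, 4> Pieces = {{{0, 1}, {2, 3}, {4, 5}, {6, 7}}};

  // insert_subvector at element index 4 with a 2-element subvector replaces
  // piece 4 / 2 == 2 of the concat.
  unsigned InsIdx = 4, Factor = 2;
  Pieces[InsIdx / Factor] = {40, 50};

  for (auto &P : Pieces)
    std::printf("%d %d ", P[0], P[1]);
  std::printf("\n"); // 0 1 2 3 40 50 6 7
  return 0;
}
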
if (DAG.getDataLayout().isBigEndian()) { - Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits); + Bits.lshrInPlace((Split - SubIdx - 1) * NumSubBits); } else { - Bits = Bits.lshr(SubIdx * NumSubBits); + Bits.lshrInPlace(SubIdx * NumSubBits); } if (Split > 1) @@ -15041,7 +16501,7 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { /// => /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] -SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { +SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -15096,7 +16556,7 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { /// As a result, we precompute A/2 prior to the iteration loop. SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal) { + SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); @@ -15140,7 +16600,7 @@ SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal) { + SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); @@ -15185,7 +16645,7 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if /// Op can be zero. -SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, +SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Reciprocal) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -15238,17 +16698,17 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, return SDValue(); } -SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) { +SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, true); } -SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) { +SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, false); } /// Return true if base is a frame index, which is known not to alias with /// anything but itself. Provides base object and offset as results. -static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, +static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, const GlobalValue *&GV, const void *&CV) { // Assume it is a primitive operation. 
Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr; @@ -15257,7 +16717,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, if (Base.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) { Base = Base.getOperand(0); - Offset += C->getZExtValue(); + Offset += C->getSExtValue(); } } @@ -15300,54 +16760,82 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { if (Op1->isInvariant() && Op0->writeMem()) return false; + unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3; + unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; + + // Check for BaseIndexOffset matching. + BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); + BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG); + int64_t PtrDiff; + if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) + return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); + + // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be + // able to calculate their relative offset if at least one arises + // from an alloca. However, these allocas cannot overlap and we + // can infer there is no alias. + if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase())) + if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + // If the base are the same frame index but the we couldn't find a + // constant offset, (indices are different) be conservative. + if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || + !MFI.isFixedObjectIndex(B->getIndex()))) + return false; + } + + // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis + // modified to use BaseIndexOffset. + // Gather base node and offset information. - SDValue Base1, Base2; - int64_t Offset1, Offset2; - const GlobalValue *GV1, *GV2; - const void *CV1, *CV2; - bool isFrameIndex1 = FindBaseOffset(Op0->getBasePtr(), + SDValue Base0, Base1; + int64_t Offset0, Offset1; + const GlobalValue *GV0, *GV1; + const void *CV0, *CV1; + bool IsFrameIndex0 = findBaseOffset(Op0->getBasePtr(), + Base0, Offset0, GV0, CV0); + bool IsFrameIndex1 = findBaseOffset(Op1->getBasePtr(), Base1, Offset1, GV1, CV1); - bool isFrameIndex2 = FindBaseOffset(Op1->getBasePtr(), - Base2, Offset2, GV2, CV2); - // If they have a same base address then check to see if they overlap. - if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2))) - return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 || - (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1); + // If they have the same base address, then check to see if they overlap. + if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1))) + return !((Offset0 + NumBytes0) <= Offset1 || + (Offset1 + NumBytes1) <= Offset0); // It is possible for different frame indices to alias each other, mostly // when tail call optimization reuses return address slots for arguments. // To catch this case, look up the actual index of frame indices to compute // the real alias relationship. 
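
The rewritten isAlias repeatedly applies the same disjoint-interval test: two accesses of NumBytes0 and NumBytes1 bytes at Offset0 and Offset1 from a common base cannot overlap exactly when one ends at or before the other begins. As a standalone sketch:

#include <cstdint>
#include <cstdio>

// Two accesses from a common base: [Offset0, Offset0+NumBytes0) and
// [Offset1, Offset1+NumBytes1). They may alias unless the intervals are
// disjoint.
static bool mayAlias(int64_t Offset0, int64_t NumBytes0,
                     int64_t Offset1, int64_t NumBytes1) {
  return !(Offset0 + NumBytes0 <= Offset1 ||
           Offset1 + NumBytes1 <= Offset0);
}

int main() {
  std::printf("%d\n", mayAlias(0, 4, 4, 4)); // adjacent, no overlap: 0
  std::printf("%d\n", mayAlias(0, 8, 4, 4)); // second inside first: 1
  return 0;
}
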
- if (isFrameIndex1 && isFrameIndex2) { + if (IsFrameIndex0 && IsFrameIndex1) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + Offset0 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base0)->getIndex()); Offset1 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex()); - Offset2 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base2)->getIndex()); - return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 || - (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1); + return !((Offset0 + NumBytes0) <= Offset1 || + (Offset1 + NumBytes1) <= Offset0); } // Otherwise, if we know what the bases are, and they aren't identical, then // we know they cannot alias. - if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2)) + if ((IsFrameIndex0 || CV0 || GV0) && (IsFrameIndex1 || CV1 || GV1)) return false; // If we know required SrcValue1 and SrcValue2 have relatively large alignment // compared to the size and offset of the access, we may be able to prove they - // do not alias. This check is conservative for now to catch cases created by + // do not alias. This check is conservative for now to catch cases created by // splitting vector types. - if ((Op0->getOriginalAlignment() == Op1->getOriginalAlignment()) && - (Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) && - (Op0->getMemoryVT().getSizeInBits() >> 3 == - Op1->getMemoryVT().getSizeInBits() >> 3) && - (Op0->getOriginalAlignment() > (Op0->getMemoryVT().getSizeInBits() >> 3))) { - int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment(); - int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment(); + int64_t SrcValOffset0 = Op0->getSrcValueOffset(); + int64_t SrcValOffset1 = Op1->getSrcValueOffset(); + unsigned OrigAlignment0 = Op0->getOriginalAlignment(); + unsigned OrigAlignment1 = Op1->getOriginalAlignment(); + if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && + NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) { + int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0; + int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1; // There is no overlap between these relatively aligned accesses of similar - // size, return no alias. - if ((OffAlign1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign2 || - (OffAlign2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign1) + // size. Return no alias. + if ((OffAlign0 + NumBytes0) <= OffAlign1 || + (OffAlign1 + NumBytes1) <= OffAlign0) return false; } @@ -15359,20 +16847,18 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif - if (UseAA && + + if (UseAA && AA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. - int64_t MinOffset = std::min(Op0->getSrcValueOffset(), - Op1->getSrcValueOffset()); - int64_t Overlap1 = (Op0->getMemoryVT().getSizeInBits() >> 3) + - Op0->getSrcValueOffset() - MinOffset; - int64_t Overlap2 = (Op1->getMemoryVT().getSizeInBits() >> 3) + - Op1->getSrcValueOffset() - MinOffset; + int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); + int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset; + int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset; AliasResult AAResult = - AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap1, - UseTBAA ? Op0->getAAInfo() : AAMDNodes()), - MemoryLocation(Op1->getMemOperand()->getValue(), Overlap2, - UseTBAA ? 
Op1->getAAInfo() : AAMDNodes())); + AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, + UseTBAA ? Op0->getAAInfo() : AAMDNodes()), + MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, + UseTBAA ? Op1->getAAInfo() : AAMDNodes()) ); if (AAResult == NoAlias) return false; } @@ -15454,6 +16940,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, ++Depth; break; + case ISD::CopyFromReg: + // Forward past CopyFromReg. + Chains.push_back(Chain.getOperand(0)); + ++Depth; + break; + default: // For all other instructions we will just have to take what we can get. Aliases.push_back(Chain); @@ -15482,17 +16974,29 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } +// This function tries to collect a bunch of potentially interesting +// nodes to improve the chains of, all at once. This might seem +// redundant, as this function gets called when visiting every store +// node, so why not let the work be done on each store as it's visited? +// +// I believe this is mainly important because MergeConsecutiveStores +// is unable to deal with merging stores of different sizes, so unless +// we improve the chains of all the potential candidates up-front +// before running MergeConsecutiveStores, it might only see some of +// the nodes that will eventually be candidates, and then not be able +// to go from a partially-merged state to the desired final +// fully-merged state. bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); // We must have a base and an offset. - if (!BasePtr.Base.getNode()) + if (!BasePtr.getBase().getNode()) return false; // Do not handle stores to undef base pointers. - if (BasePtr.Base.isUndef()) + if (BasePtr.getBase().isUndef()) return false; SmallVector<StoreSDNode *, 8> ChainedStores; @@ -15514,13 +17018,11 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); // Check that the base pointer is the same as the original one. - if (!Ptr.equalBaseIndex(BasePtr)) + if (!BasePtr.equalBaseIndex(Ptr, DAG)) break; - // Find the next memory operand in the chain. If the next operand in the - // chain is a store then move up and continue the scan with the next - // memory operand. If the next operand is a load save it and use alias - // information to check if it interferes with anything. + // Walk up the chain to find the next store node, ignoring any + // intermediate loads. Any other kind of node will halt the loop. SDNode *NextInChain = Index->getChain().getNode(); while (true) { if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { @@ -15539,9 +17041,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { Index = nullptr; break; } - } + } // end while } + // At this point, ChainedStores lists all of the Store nodes + // reachable by iterating up through chain nodes matching the above + // conditions. For each such store identified, try to find an + // earlier chain to attach the store to which won't violate the + // required ordering. bool MadeChangeToSt = false; SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains; @@ -15565,7 +17072,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { } /// This is the entry point for the file. 
-void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, +void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. DAGCombiner(*this, AA, OptLevel).Run(Level); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index e2f33bb..b2599b2 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1,4 +1,4 @@ -//===-- FastISel.cpp - Implementation of the FastISel class ---------------===// +//===- FastISel.cpp - Implementation of the FastISel class ----------------===// // // The LLVM Compiler Infrastructure // @@ -39,35 +39,76 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/FastISel.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "isel" @@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by " "target-specific selector"); 
STATISTIC(NumFastIselDead, "Number of dead insts removed on failure"); -void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS, - unsigned AttrIdx) { - IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); - IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); - IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); - IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); - IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); - IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); - IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); - IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); - IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); - Alignment = CS->getParamAlignment(AttrIdx); -} - /// Set the current block to which generated machine instructions will be /// appended, and clear the local CSE map. void FastISel::startNewBlock() { @@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) { // Try to emit the constant by using an integer constant with a cast. const APFloat &Flt = CF->getValueAPF(); EVT IntVT = TLI.getPointerTy(DL); - - uint64_t x[2]; uint32_t IntBitWidth = IntVT.getSizeInBits(); + APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false); bool isExact; - (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, - APFloat::rmTowardZero, &isExact); + (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact); if (isExact) { - APInt IntVal(IntBitWidth, x); - unsigned IntegerReg = - getRegForValue(ConstantInt::get(V->getContext(), IntVal)); + getRegForValue(ConstantInt::get(V->getContext(), SIntVal)); if (IntegerReg != 0) Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg, /*Kill=*/false); @@ -600,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) { // have to worry about calling conventions and target-specific lowering code. // Instead we perform the call lowering right here. // - // CALLSEQ_START(0...) + // CALLSEQ_START(0, 0...) // STACKMAP(id, nbytes, ...) // CALLSEQ_END(0, 0) // @@ -646,7 +668,7 @@ bool FastISel::selectStackmap(const CallInst *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::STACKMAP)); for (auto const &MO : Ops) - MIB.addOperand(MO); + MIB.add(MO); // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); @@ -672,10 +694,8 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, Args.reserve(NumArgs); // Populate the argument list. - // Attributes for args start at offset 1, after the return attribute. 
ImmutableCallSite CS(CI); - for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; - ArgI != ArgE; ++ArgI) { + for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) { Value *V = CI->getOperand(ArgI); assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); @@ -683,7 +703,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, ArgListEntry Entry; Entry.Val = V; Entry.Ty = V->getType(); - Entry.setAttributes(&CS, AttrI); + Entry.setAttributes(&CS, ArgIdx); Args.push_back(Entry); } @@ -826,7 +846,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) { TII.get(TargetOpcode::PATCHPOINT)); for (auto &MO : Ops) - MIB.addOperand(MO); + MIB.add(MO); MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI); @@ -841,9 +861,28 @@ bool FastISel::selectPatchpoint(const CallInst *I) { return true; } -/// Returns an AttributeSet representing the attributes applied to the return +bool FastISel::selectXRayCustomEvent(const CallInst *I) { + const auto &Triple = TM.getTargetTriple(); + if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux()) + return true; // don't do anything to this instruction. + SmallVector<MachineOperand, 8> Ops; + Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)), + /*IsDef=*/false)); + Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(1)), + /*IsDef=*/false)); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::PATCHABLE_EVENT_CALL)); + for (auto &MO : Ops) + MIB.add(MO); + // Insert the Patchable Event Call instruction, that gets lowered properly. + return true; +} + + +/// Returns an AttributeList representing the attributes applied to the return /// value of the given call. -static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) { SmallVector<Attribute::AttrKind, 2> Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); @@ -852,8 +891,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); - return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, - Attrs); + return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, + Attrs); } bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName, @@ -885,9 +924,10 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol, ArgListEntry Entry; Entry.Val = V; Entry.Ty = V->getType(); - Entry.setAttributes(&CS, ArgI + 1); + Entry.setAttributes(&CS, ArgI); Args.push_back(Entry); } + TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args); CallLoweringInfo CLI; CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs); @@ -1021,7 +1061,7 @@ bool FastISel::lowerCall(const CallInst *CI) { Entry.Ty = V->getType(); // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Entry.setAttributes(&CS, i - CS.arg_begin()); Args.push_back(Entry); } @@ -1110,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { return true; } - unsigned Offset = 0; + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. 
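The attribute hunks above drop the "+ 1" that used to skip the return-value slot, which matches the AttributeList interface treating parameter attribute queries as 0-based. A hedged sketch of the resulting idiom (illustrative helper, not code from this patch):

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/CallSite.h"
  using namespace llvm;

  // With the 0-based convention, the entry built for call operand ArgNo asks
  // about attribute slot ArgNo directly instead of ArgNo + 1.
  static bool argIsSExt(ImmutableCallSite CS, unsigned ArgNo) {
    return CS.paramHasAttr(ArgNo, Attribute::SExt);
  }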
+ const auto *Arg = + dyn_cast<Argument>(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return true; + Optional<MachineOperand> Op; - if (const auto *Arg = dyn_cast<Argument>(Address)) - // Some arguments' frame index is recorded during argument lowering. - Offset = FuncInfo.getArgumentFrameIndex(Arg); - if (Offset) - Op = MachineOperand::CreateFI(Offset); - if (!Op) - if (unsigned Reg = lookUpRegForValue(Address)) - Op = MachineOperand::CreateReg(Reg, false); + if (unsigned Reg = lookUpRegForValue(Address)) + Op = MachineOperand::CreateReg(Reg, false); // If we have a VLA that has a "use" in a metadata node that's then used // here but it has no other uses, then we have a problem. E.g., @@ -1143,13 +1183,15 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { "Expected inlined-at fields to agree"); if (Op->isReg()) { Op->setIsDebug(true); + // A dbg.declare describes the address of a source variable, so lower it + // into an indirect DBG_VALUE. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0, - DI->getVariable(), DI->getExpression()); + TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true, + Op->getReg(), 0, DI->getVariable(), DI->getExpression()); } else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) + .add(*Op) .addImm(0) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); @@ -1229,6 +1271,9 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: return selectPatchpoint(II); + + case Intrinsic::xray_customevent: + return selectXRayCustomEvent(II); } return fastLowerIntrinsicCall(II); @@ -1362,7 +1407,7 @@ bool FastISel::selectInstruction(const Instruction *I) { if (const auto *Call = dyn_cast<CallInst>(I)) { const Function *F = Call->getCalledFunction(); - LibFunc::Func Func; + LibFunc Func; // As a special case, don't handle calls to builtin library functions that // may be translated directly to target instructions. @@ -1665,7 +1710,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo, TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo), SkipTargetIndependentISel(SkipTargetIndependentISel) {} -FastISel::~FastISel() {} +FastISel::~FastISel() = default; bool FastISel::fastLowerArguments() { return false; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 377a523..b736037 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF = &mf; TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - MachineModuleInfo &MMI = MF->getMMI(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); unsigned StackAlign = TFI->getStackAlignment(); @@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I))) InitializeRegForValue(&I); - // Collect llvm.dbg.declare information. This is done now instead of - // during the initial isel pass through the IR so that it is done - // in a predictable order. 
- if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) { - assert(DI->getVariable() && "Missing variable"); - assert(DI->getDebugLoc() && "Missing location"); - if (MMI.hasDebugInfo()) { - // Don't handle byval struct arguments or VLAs, for example. - // Non-byval arguments are handled here (they refer to the stack - // temporary alloca at this point). - const Value *Address = DI->getAddress(); - if (Address) { - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address)) - Address = BCI->getOperand(0); - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) { - DenseMap<const AllocaInst *, int>::iterator SI = - StaticAllocaMap.find(AI); - if (SI != StaticAllocaMap.end()) { // Check for VLAs. - int FI = SI->second; - MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(), - FI, DI->getDebugLoc()); - } - } - } - } - } - // Decide the preferred extend type for a value. PreferredExtendType[&I] = getPreferredExtendForValue(&I); } @@ -400,10 +372,9 @@ FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) { if (!LOI->IsValid) return nullptr; - if (BitWidth > LOI->KnownZero.getBitWidth()) { + if (BitWidth > LOI->Known.getBitWidth()) { LOI->NumSignBits = 1; - LOI->KnownZero = LOI->KnownZero.zextOrTrunc(BitWidth); - LOI->KnownOne = LOI->KnownOne.zextOrTrunc(BitWidth); + LOI->Known = LOI->Known.zextOrTrunc(BitWidth); } return LOI; @@ -436,17 +407,15 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { Value *V = PN->getIncomingValue(0); if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) { DestLOI.NumSignBits = 1; - APInt Zero(BitWidth, 0); - DestLOI.KnownZero = Zero; - DestLOI.KnownOne = Zero; + DestLOI.Known = KnownBits(BitWidth); return; } if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { APInt Val = CI->getValue().zextOrTrunc(BitWidth); DestLOI.NumSignBits = Val.getNumSignBits(); - DestLOI.KnownZero = ~Val; - DestLOI.KnownOne = Val; + DestLOI.Known.Zero = ~Val; + DestLOI.Known.One = Val; } else { assert(ValueMap.count(V) && "V should have been placed in ValueMap when its" "CopyToReg node was created."); @@ -463,25 +432,23 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { DestLOI = *SrcLOI; } - assert(DestLOI.KnownZero.getBitWidth() == BitWidth && - DestLOI.KnownOne.getBitWidth() == BitWidth && + assert(DestLOI.Known.Zero.getBitWidth() == BitWidth && + DestLOI.Known.One.getBitWidth() == BitWidth && "Masks should have the same bit width as the type."); for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) { DestLOI.NumSignBits = 1; - APInt Zero(BitWidth, 0); - DestLOI.KnownZero = Zero; - DestLOI.KnownOne = Zero; + DestLOI.Known = KnownBits(BitWidth); return; } if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { APInt Val = CI->getValue().zextOrTrunc(BitWidth); DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits()); - DestLOI.KnownZero &= ~Val; - DestLOI.KnownOne &= Val; + DestLOI.Known.Zero &= ~Val; + DestLOI.Known.One &= Val; continue; } @@ -498,8 +465,8 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { return; } DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits); - DestLOI.KnownZero &= SrcLOI->KnownZero; - DestLOI.KnownOne &= SrcLOI->KnownOne; + DestLOI.Known.Zero &= SrcLOI->Known.Zero; + DestLOI.Known.One &= SrcLOI->Known.One; } } @@ -515,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A, /// If the argument does not have 
any assigned frame index then 0 is /// returned. int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) { - DenseMap<const Argument *, int>::iterator I = - ByValArgFrameIndexMap.find(A); + auto I = ByValArgFrameIndexMap.find(A); if (I != ByValArgFrameIndexMap.end()) return I->second; DEBUG(dbgs() << "Argument does not have assigned frame index!\n"); - return 0; + return INT_MAX; } unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg( @@ -557,3 +523,29 @@ void FunctionLoweringInfo::setCurrentSwiftErrorVReg( const MachineBasicBlock *MBB, const Value *Val, unsigned VReg) { SwiftErrorVRegDefMap[std::make_pair(MBB, Val)] = VReg; } + +std::pair<unsigned, bool> +FunctionLoweringInfo::getOrCreateSwiftErrorVRegDefAt(const Instruction *I) { + auto Key = PointerIntPair<const Instruction *, 1, bool>(I, true); + auto It = SwiftErrorVRegDefUses.find(Key); + if (It == SwiftErrorVRegDefUses.end()) { + auto &DL = MF->getDataLayout(); + const TargetRegisterClass *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); + unsigned VReg = MF->getRegInfo().createVirtualRegister(RC); + SwiftErrorVRegDefUses[Key] = VReg; + return std::make_pair(VReg, true); + } + return std::make_pair(It->second, false); +} + +std::pair<unsigned, bool> +FunctionLoweringInfo::getOrCreateSwiftErrorVRegUseAt(const Instruction *I, const MachineBasicBlock *MBB, const Value *Val) { + auto Key = PointerIntPair<const Instruction *, 1, bool>(I, false); + auto It = SwiftErrorVRegDefUses.find(Key); + if (It == SwiftErrorVRegDefUses.end()) { + unsigned VReg = getOrCreateSwiftErrorVReg(MBB, Val); + SwiftErrorVRegDefUses[Key] = VReg; + return std::make_pair(VReg, true); + } + return std::make_pair(It->second, false); +} diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 4a9042c..b96c96f 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -161,7 +161,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, if (VRBase) { DstRC = MRI->getRegClass(VRBase); } else if (UseRC) { - assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!"); + assert(TRI->isTypeLegalForClass(*UseRC, VT) && + "Incompatible phys register def and uses!"); DstRC = UseRC; } else { DstRC = TLI->getRegClassFor(VT); @@ -235,7 +236,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, if (II.OpInfo[i].isOptionalDef()) { // Optional def must be a physical register. 
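The sentinel change above matters to callers: frame index 0 is a valid index, so "no assigned frame index" is now reported as INT_MAX, which is exactly what the earlier FastISel hunk tests with getArgumentFrameIndex(Arg) != INT_MAX. A minimal caller-side sketch, assuming the surrounding FunctionLoweringInfo API:

  #include "llvm/CodeGen/FunctionLoweringInfo.h"
  #include "llvm/IR/Argument.h"
  #include <climits>
  using namespace llvm;

  // True when argument lowering recorded a dedicated stack slot for A.
  // Note the comparison is against INT_MAX, not 0, after this change.
  static bool hasRecordedFrameIndex(FunctionLoweringInfo &FuncInfo,
                                    const Argument *A) {
    return FuncInfo.getArgumentFrameIndex(A) != INT_MAX;
  }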
- unsigned NumResults = CountResults(Node); VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg(); assert(TargetRegisterInfo::isPhysicalRegister(VRBase)); MIB.addReg(VRBase, RegState::Define); @@ -589,7 +589,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, } else AddOperand(MIB, N0, 0, nullptr, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); - // Add the subregster being inserted + // Add the subregister being inserted AddOperand(MIB, N1, 0, nullptr, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); MIB.addImm(SubIdx); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b002825..7e4bc3c 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -899,6 +899,35 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { } } +static TargetLowering::LegalizeAction +getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) { + unsigned EqOpc; + switch (Opcode) { + default: llvm_unreachable("Unexpected FP pseudo-opcode"); + case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; + case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; + case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; + case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; + case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break; + case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; + case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; + case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; + case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; + case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; + } + + auto Action = TLI.getOperationAction(EqOpc, VT); + + // We don't currently handle Custom or Promote for strict FP pseudo-ops. + // For now, we just expand for those cases. + if (Action != TargetLowering::Legal) + Action = TargetLowering::Expand; + + return Action; +} + /// Return a legal replacement for the given operation, with all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG)); @@ -994,7 +1023,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { break; case ISD::EXTRACT_ELEMENT: case ISD::FLT_ROUNDS_: - case ISD::FPOWI: case ISD::MERGE_VALUES: case ISD::EH_RETURN: case ISD::FRAME_TO_ARGS_OFFSET: @@ -1043,6 +1071,25 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { return; } break; + case ISD::STRICT_FSQRT: + case ISD::STRICT_FPOW: + case ISD::STRICT_FPOWI: + case ISD::STRICT_FSIN: + case ISD::STRICT_FCOS: + case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: + case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG10: + case ISD::STRICT_FLOG2: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: + // These pseudo-ops get legalized as if they were their non-strict + // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT + // is also legal, but if ISD::FSQRT requires expansion then so does + // ISD::STRICT_FSQRT. + Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(), + Node->getValueType(0)); + break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { @@ -1192,8 +1239,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { // If the index is dependent on the store we will introduce a cycle when // creating the load (the load uses the index, and by replacing the chain - // we will make the index dependent on the load). 
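The rule stated in the new strict-FP comment can be summarized in a couple of lines: once a STRICT_* opcode has been mapped to its plain counterpart, Legal stays Legal and everything else is expanded, because Custom and Promote are not handled for the strict forms yet. A simplified restatement, not the helper added by this patch:

  #include "llvm/Target/TargetLowering.h"
  using namespace llvm;

  // PlainOpc is the non-strict equivalent (FSQRT for STRICT_FSQRT, and so on).
  static TargetLowering::LegalizeAction
  strictActionFor(const TargetLowering &TLI, unsigned PlainOpc, EVT VT) {
    return TLI.getOperationAction(PlainOpc, VT) == TargetLowering::Legal
               ? TargetLowering::Legal
               : TargetLowering::Expand;
  }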
- if (SDNode::hasPredecessorHelper(ST, Visited, Worklist)) + // we will make the index dependent on the load). Also, the store might be + // dependent on the extractelement and introduce a cycle when creating + // the load. + if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) || + ST->hasPredecessor(Op.getNode())) continue; StackPtr = ST->getBasePtr(); @@ -1340,7 +1390,7 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State, // Convert to an integer of the same size. if (TLI.isTypeLegal(IVT)) { State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value); - State.SignMask = APInt::getSignBit(NumBits); + State.SignMask = APInt::getSignMask(NumBits); State.SignBit = NumBits - 1; return; } @@ -1490,7 +1540,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); @@ -1909,8 +1959,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1935,9 +1985,14 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setTailCall(isTailCall) + .setSExtResult(isSigned) + .setZExtResult(!isSigned) + .setIsPostTypeLegalization(true); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1960,8 +2015,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, for (unsigned i = 0; i != NumOps; ++i) { Entry.Node = Ops[i]; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1970,9 +2025,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned) + .setIsPostTypeLegalization(true); std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1994,8 +2053,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + 
Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2004,9 +2063,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2019,6 +2081,9 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { + if (Node->isStrictFPOpcode()) + Node = DAG.mutateStrictFPToFP(Node); + RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); @@ -2081,8 +2146,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } @@ -2090,8 +2155,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SDValue FIPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = FIPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2099,9 +2164,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2126,19 +2194,6 @@ static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { return TLI.getLibcallName(LC) != nullptr; } -/// Return true if sincos libcall is available and can be used to combine sin -/// and cos. -static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI, - const TargetMachine &TM) { - if (!isSinCosLibcallAvailable(Node, TLI)) - return false; - // GNU sin/cos functions set errno while sincos does not. Therefore - // combining sin and cos is only safe if unsafe-fpmath is enabled. - if (TM.getTargetTriple().isGNUEnvironment() && !TM.Options.UnsafeFPMath) - return false; - return true; -} - /// Only issue sincos libcall if both sin and cos are needed. static bool useSinCos(SDNode *Node) { unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN @@ -2185,24 +2240,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, // Pass the argument. Entry.Node = Node->getOperand(0); Entry.Ty = RetTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); // Pass the return address of sin. 
SDValue SinPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = SinPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); // Also pass the return address of the cos. SDValue CosPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = CosPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2210,9 +2265,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), - Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); + CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( + TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, + std::move(Args)); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2529,12 +2584,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) { APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0); APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0); for (unsigned J = 0; J != Sz; J += 8) { - MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J)); - MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J)); - MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J)); - MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J)); - MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J)); - MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J)); + MaskHi4 = MaskHi4 | (0xF0ull << J); + MaskLo4 = MaskLo4 | (0x0Full << J); + MaskHi2 = MaskHi2 | (0xCCull << J); + MaskLo2 = MaskLo2 | (0x33ull << J); + MaskHi1 = MaskHi1 | (0xAAull << J); + MaskLo1 = MaskLo1 | (0x55ull << J); } // BSWAP if the type is wider than a single byte. @@ -2573,7 +2628,7 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) { DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT)); APInt Shift(Sz, 1); - Shift = Shift.shl(J); + Shift <<= J; Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2); } @@ -2968,7 +3023,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT NVT = Node->getValueType(0); APFloat apf(DAG.EVTToAPFloatSemantics(VT), APInt::getNullValue(VT.getSizeInBits())); - APInt x = APInt::getSignBit(NVT.getSizeInBits()); + APInt x = APInt::getSignMask(NVT.getSizeInBits()); (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven); Tmp1 = DAG.getConstantFP(apf, dl, VT); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT), @@ -3091,7 +3146,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { TLI.getVectorIdxTy(DAG.getDataLayout())))); } - Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + Tmp1 = DAG.getBuildVector(VT, dl, Ops); // We may have changed the BUILD_VECTOR type. Cast it back to the Node type. Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1); Results.push_back(Tmp1); @@ -3181,7 +3236,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / // fcos which share the same operand and both are used. 
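The sincos expansion above lowers one FSINCOS node into a single C-library call that returns both results through pointers to stack temporaries. The library contract it targets is typically the GNU sincos family (sincosf/sincos/sincosl); a hedged sketch of the call shape, assuming a libm that provides it:

  // Declaration of the glibc-style entry point targeted by the expansion.
  extern "C" void sincos(double X, double *Sin, double *Cos);

  static void sinAndCos(double X, double &S, double &C) {
    double SinTmp, CosTmp;       // stand-ins for the two stack temporaries
    sincos(X, &SinTmp, &CosTmp); // assumes a sincos-providing C library
    S = SinTmp;
    C = CosTmp;
  }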
if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || - canCombineSinCosLibcall(Node, TLI, TM)) + isSinCosLibcallAvailable(Node, TLI)) && useSinCos(Node)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); @@ -3237,7 +3292,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) { - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(Node)->Flags; + const SDNodeFlags Flags = Node->getFlags(); Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags); Results.push_back(Tmp1); @@ -3477,17 +3532,24 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!"); - // The high part is obtained by SRA'ing all but one of the bits of low - // part. - unsigned LoSize = VT.getSizeInBits(); - SDValue HiLHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); - SDValue HiRHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + SDValue HiLHS; + SDValue HiRHS; + if (isSigned) { + // The high part is obtained by SRA'ing all but one of the bits of low + // part. + unsigned LoSize = VT.getSizeInBits(); + HiLHS = + DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + HiRHS = + DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + } else { + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); + } // Here we're passing the 2 arguments explicitly as 4 arguments that are // pre-lowered to the correct types. This all depends upon WideVT not @@ -3505,16 +3567,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl); } - BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret, - DAG.getIntPtrConstant(0, dl)); - TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret, - DAG.getIntPtrConstant(1, dl)); - // Ret is a node with an illegal type. Because such things are not - // generally permitted during this phase of legalization, make sure the - // node has no more uses. The above EXTRACT_ELEMENT nodes should have been - // folded. 
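The HiLHS/HiRHS change above is what makes the widened multiply libcall correct for unsigned operations: only a signed multiply wants the high halves to be sign replicas of the low halves, while an unsigned one wants zeros. In plain integer terms (illustrative, not DAG code):

  #include <cstdint>

  // High 64 bits of the 128-bit operand handed to the multiply libcall:
  // sign-replicate for signed inputs, zero for unsigned inputs.
  static uint64_t highHalf(int64_t LoHalf, bool IsSigned) {
    if (!IsSigned)
      return 0;
    return LoHalf < 0 ? ~uint64_t(0) : uint64_t(0);
  }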
- assert(Ret->use_empty() && - "Unexpected uses of illegally type from expanded lib call."); + assert(Ret.getOpcode() == ISD::MERGE_VALUES && + "Ret value is a collection of constituent nodes holding result."); + BottomHalf = Ret.getOperand(0); + TopHalf = Ret.getOperand(1); } if (isSigned) { @@ -3790,8 +3846,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Scalars.push_back(DAG.getNode(Node->getOpcode(), dl, VT.getScalarType(), Ex, Sh)); } - SDValue Result = - DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars); + + SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars); ReplaceNode(SDValue(Node, 0), Result); break; } @@ -3830,10 +3886,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__sync_synchronize", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + .setLibCallee( + CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -3870,10 +3927,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("abort", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol( + "abort", TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); Results.push_back(CallResult.second); @@ -3890,16 +3947,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::FMAX_PPCF128)); break; case ISD::FSQRT: + case ISD::STRICT_FSQRT: Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128)); break; case ISD::FSIN: + case ISD::STRICT_FSIN: Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128)); break; case ISD::FCOS: + case ISD::STRICT_FCOS: Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128)); @@ -3909,26 +3969,31 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandSinCosLibCall(Node, Results); break; case ISD::FLOG: + case ISD::STRICT_FLOG: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128)); break; case ISD::FLOG2: + case ISD::STRICT_FLOG2: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128)); break; case ISD::FLOG10: + case ISD::STRICT_FLOG10: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128)); break; case ISD::FEXP: + case ISD::STRICT_FEXP: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128)); break; case ISD::FEXP2: + case ISD::STRICT_FEXP2: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128)); @@ -3949,11 
+4014,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::CEIL_PPCF128)); break; case ISD::FRINT: + case ISD::STRICT_FRINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128)); break; case ISD::FNEARBYINT: + case ISD::STRICT_FNEARBYINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, @@ -3968,11 +4035,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::ROUND_PPCF128)); break; case ISD::FPOWI: + case ISD::STRICT_FPOWI: Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128)); break; case ISD::FPOW: + case ISD::STRICT_FPOW: Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); @@ -4170,6 +4239,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { ReplacedNode(Node); break; } + case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::UDIV: @@ -4424,8 +4494,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { NewOps.push_back(Elt); } - SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps); - + SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps); Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec)); break; } @@ -4519,6 +4588,14 @@ void SelectionDAG::Legalize() { AssignTopologicalOrder(); SmallPtrSet<SDNode *, 16> LegalizedNodes; + // Use a delete listener to remove nodes which were deleted during + // legalization from LegalizeNodes. This is needed to handle the situation + // where a new node is allocated by the object pool to the same address of a + // previously deleted node. + DAGNodeDeletedListener DeleteListener( + *this, + [&LegalizedNodes](SDNode *N, SDNode *E) { LegalizedNodes.erase(N); }); + SelectionDAGLegalize Legalizer(*this, LegalizedNodes); // Visit all the nodes. We start in topological order, so that we see diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 72b56d8..eaf177d 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -72,7 +72,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: - R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; + R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break; case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; @@ -112,15 +112,15 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break; } - // If R is null, the sub-method took care of registering the result. - if (R.getNode()) { + if (R.getNode() && R.getNode() != N) { SetSoftenedFloat(SDValue(N, ResNo), R); - ReplaceSoftenFloatResult(N, ResNo, R); + // Return true only if the node is changed, assuming that the operands + // are also converted when necessary. + return true; } - // Return true only if the node is changed, - // assuming that the operands are also converted when necessary. + // Otherwise, return false to tell caller to scan operands. 
- return R.getNode() && R.getNode() != N; + return false; } SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { @@ -171,7 +171,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { } } -SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, keep the extracted value in register. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NewOp.getValueType().getVectorElementType(), @@ -459,7 +462,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) - SoftenFloatResult(Op.getNode(), 0); + AddToWorklist(Op.getNode()); } if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { @@ -472,8 +475,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); - if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) - Op = GetSoftenedFloat(Op); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } @@ -752,12 +753,17 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { llvm_unreachable("Do not know how to soften this operator's operand!"); case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; + case ISD::CopyToReg: Res = SoftenFloatOp_COPY_TO_REG(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; + case ISD::FABS: Res = SoftenFloatOp_FABS(N); break; + case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break; + case ISD::FNEG: Res = SoftenFloatOp_FNEG(N); break; case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break; case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break; + case ISD::SELECT: Res = SoftenFloatOp_SELECT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; case ISD::STORE: @@ -790,9 +796,9 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) return false; - // When the operand type can be kept in registers, SoftenFloatResult - // will call ReplaceValueWith to replace all references and we can - // skip softening this operand. + + // When the operand type can be kept in registers there is nothing to do for + // the following opcodes. switch (N->getOperand(OpNo).getOpcode()) { case ISD::BITCAST: case ISD::ConstantFP: @@ -806,18 +812,12 @@ bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::SELECT_CC: return true; } - // For some opcodes, SoftenFloatResult handles all conversion of softening - // and replacing operands, so that there is no need to soften operands - // again, although such opcode could be scanned for other illegal operands. 
+ switch (N->getOpcode()) { - case ISD::ConstantFP: - case ISD::CopyFromReg: - case ISD::CopyToReg: - case ISD::FABS: - case ISD::FCOPYSIGN: - case ISD::FNEG: - case ISD::Register: - case ISD::SELECT: + case ISD::ConstantFP: // Leaf node. + case ISD::CopyFromReg: // Operand is a register that we know to be left + // unchanged by SoftenFloatResult(). + case ISD::Register: // Leaf node. return true; } return false; @@ -828,6 +828,21 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { GetSoftenedFloat(N->getOperand(0))); } +SDValue DAGTypeLegalizer::SoftenFloatOp_COPY_TO_REG(SDNode *N) { + SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); + SDValue Op2 = GetSoftenedFloat(N->getOperand(2)); + + if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2)) + return SDValue(); + + if (N->getNumOperands() == 3) + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), 0); + + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2, + N->getOperand(3)), + 0); +} + SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) { // If we get here, the result must be legal but the source illegal. EVT SVT = N->getOperand(0).getValueType(); @@ -883,6 +898,34 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { 0); } +SDValue DAGTypeLegalizer::SoftenFloatOp_FABS(SDNode *N) { + SDValue Op = GetSoftenedFloat(N->getOperand(0)); + + if (Op == N->getOperand(0)) + return SDValue(); + + return SDValue(DAG.UpdateNodeOperands(N, Op), 0); +} + +SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) { + SDValue Op0 = GetSoftenedFloat(N->getOperand(0)); + SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); + + if (Op0 == N->getOperand(0) && Op1 == N->getOperand(1)) + return SDValue(); + + return SDValue(DAG.UpdateNodeOperands(N, Op0, Op1), 0); +} + +SDValue DAGTypeLegalizer::SoftenFloatOp_FNEG(SDNode *N) { + SDValue Op = GetSoftenedFloat(N->getOperand(0)); + + if (Op == N->getOperand(0)) + return SDValue(); + + return SDValue(DAG.UpdateNodeOperands(N, Op), 0); +} + SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { bool Signed = N->getOpcode() == ISD::FP_TO_SINT; EVT SVT = N->getOperand(0).getValueType(); @@ -912,6 +955,17 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res); } +SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT(SDNode *N) { + SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); + SDValue Op2 = GetSoftenedFloat(N->getOperand(2)); + + if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2)) + return SDValue(); + + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), + 0); +} + SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get(); @@ -1054,15 +1108,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - assert(NVT.getSizeInBits() == integerPartWidth && + assert(NVT.getSizeInBits() == 64 && "Do not know how to expand this float constant!"); APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt(); SDLoc dl(N); Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), - APInt(integerPartWidth, C.getRawData()[1])), + APInt(64, C.getRawData()[1])), dl, NVT); Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), - 
APInt(integerPartWidth, C.getRawData()[0])), + APInt(64, C.getRawData()[0])), dl, NVT); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index dc436ce..75fec7b 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -21,6 +21,7 @@ #include "LegalizeTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -134,6 +135,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SMULO: case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break; + case ISD::ADDCARRY: + case ISD::SUBCARRY: Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break; + case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break; @@ -510,9 +514,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); EVT ValueVTs[] = { N->getValueType(0), NVT }; - SDValue Ops[] = { N->getOperand(0), N->getOperand(1) }; + SDValue Ops[3] = { N->getOperand(0), N->getOperand(1) }; + unsigned NumOps = N->getNumOperands(); + assert(NumOps <= 3 && "Too many operands"); + if (NumOps == 3) + Ops[2] = N->getOperand(2); + SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), - DAG.getVTList(ValueVTs), Ops); + DAG.getVTList(ValueVTs), makeArrayRef(Ops, NumOps)); // Modified the sum result - switch anything that used the old sum to use // the new one. @@ -606,9 +615,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS, N->getOperand(2)); - assert(NVT.bitsLE(SVT) && "Integer type overpromoted?"); // Convert to the expected type. - return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC); + return DAG.getSExtOrTrunc(SetCC, dl, NVT); } SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { @@ -690,7 +698,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(InOp); break; - case TargetLowering::TypeSplitVector: + case TargetLowering::TypeSplitVector: { EVT InVT = InOp.getValueType(); assert(InVT.isVector() && "Cannot split scalar types"); unsigned NumElts = InVT.getVectorNumElements(); @@ -709,6 +717,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2); } + case TargetLowering::TypeWidenVector: { + SDValue WideInOp = GetWidenedVector(InOp); + + // Truncate widened InOp. + unsigned NumElem = WideInOp.getValueType().getVectorNumElements(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), + N->getValueType(0).getScalarType(), NumElem); + SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp); + + // Zero extend so that the elements are of same type as those of NVT + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(), + NumElem); + SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc); + + // Extract the low NVT subvector. 
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx); + } + } // Truncate to NVT instead of VT return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); @@ -742,6 +770,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { return Res; } +SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo) { + if (ResNo == 1) + return PromoteIntRes_Overflow(N); + llvm_unreachable("Not implemented"); +} + SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { // Promote the overflow bit trivially. if (ResNo == 1) @@ -904,6 +938,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; + + case ISD::ADDCARRY: + case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break; } // If the result is null, the sub-method took care of registering results etc. @@ -1089,6 +1126,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { SDValue Cond = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); + if (N->getOpcode() == ISD::VSELECT) + if (SDValue Res = WidenVSELECTAndMask(N)) + return Res; + // Promote all the way up to the canonical SetCC type. EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy; Cond = PromoteTargetBoolean(Cond, OpVT); @@ -1252,6 +1293,30 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { N->getOperand(0).getValueType().getScalarType()); } +SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) { + assert(OpNo == 2 && "Don't know how to promote this operand!"); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDLoc DL(N); + + auto VT = getSetCCResultType(LHS.getValueType()); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(VT); + switch (BoolType) { + case TargetLoweringBase::UndefinedBooleanContent: + Carry = DAG.getAnyExtOrTrunc(Carry, DL, VT); + break; + case TargetLoweringBase::ZeroOrOneBooleanContent: + Carry = DAG.getZExtOrTrunc(Carry, DL, VT); + break; + case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + Carry = DAG.getSExtOrTrunc(Carry, DL, VT); + break; + } + + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0); +} //===----------------------------------------------------------------------===// // Integer Result Expansion @@ -1371,6 +1436,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ADDE: case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break; + case ISD::ADDCARRY: + case ISD::SUBCARRY: ExpandIntRes_ADDSUBCARRY(N, Lo, Hi); break; + case ISD::SHL: case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; @@ -1501,11 +1569,11 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); - APInt KnownZero, KnownOne; - DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne); + KnownBits Known; + DAG.computeKnownBits(N->getOperand(1), Known); // If we don't know anything about the high bits, exit. - if (((KnownZero|KnownOne) & HighBitMask) == 0) + if (((Known.Zero|Known.One) & HighBitMask) == 0) return false; // Get the incoming operand to be shifted. 
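The KnownBits migration in this hunk folds the old KnownZero/KnownOne pair into one struct; a bit of the shift amount is known when it is set in either mask. A small sketch of the early-exit test the expansion now performs:

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // True when at least one bit selected by HighBitMask is known to be 0 or 1;
  // if nothing is known there, the cheap shift expansion gives up.
  static bool anyHighBitKnown(const KnownBits &Known, const APInt &HighBitMask) {
    return (Known.Zero | Known.One).intersects(HighBitMask);
  }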
@@ -1514,7 +1582,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { // If we know that any of the high bits of the shift amount are one, then we // can do this as a couple of simple shifts. - if (KnownOne.intersects(HighBitMask)) { + if (Known.One.intersects(HighBitMask)) { // Mask out the high bit, which we know is set. Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt, DAG.getConstant(~HighBitMask, dl, ShTy)); @@ -1539,7 +1607,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { // If we know that all of the high bits of the shift amount are zero, then we // can do this as a couple of simple shifts. - if ((KnownZero & HighBitMask) == HighBitMask) { + if (HighBitMask.isSubsetOf(Known.Zero)) { // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined // shift if x is zero. We can use XOR here because x is known to be smaller // than 32. @@ -1714,6 +1782,23 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; + bool HasOpCarry = TLI.isOperationLegalOrCustom( + N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY, + TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + if (HasOpCarry) { + SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT)); + if (N->getOpcode() == ISD::ADD) { + Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps); + } else { + Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps); + } + return; + } + // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support // them. TODO: Teach operation legalization how to expand unsupported // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate @@ -1742,9 +1827,11 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? 
ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); + if (hasOVF) { - SDVTList VTList = DAG.getVTList(NVT, NVT); - TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); + EVT OvfVT = getSetCCResultType(NVT); + SDVTList VTList = DAG.getVTList(NVT, OvfVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; @@ -1759,12 +1846,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, switch (BoolType) { case TargetLoweringBase::UndefinedBooleanContent: - OVF = DAG.getNode(ISD::AND, dl, NVT, DAG.getConstant(1, dl, NVT), OVF); + OVF = DAG.getNode(ISD::AND, dl, OvfVT, DAG.getConstant(1, dl, OvfVT), OVF); LLVM_FALLTHROUGH; case TargetLoweringBase::ZeroOrOneBooleanContent: + OVF = DAG.getZExtOrTrunc(OVF, dl, NVT); Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF); break; case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + OVF = DAG.getSExtOrTrunc(OVF, dl, NVT); Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF); } return; @@ -1775,6 +1864,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); + + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { + SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); + Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + return; + } + SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); @@ -1789,9 +1885,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); - SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, - DAG.getConstant(1, dl, NVT), - DAG.getConstant(0, dl, NVT)); + + SDValue Borrow; + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) + Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); + else + Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), + DAG.getConstant(0, dl, NVT)); + Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } @@ -1842,6 +1943,71 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } +void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, + SDValue &Lo, SDValue &Hi) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDLoc dl(N); + + SDValue Ovf; + + bool HasOpCarry = TLI.isOperationLegalOrCustom( + N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY, + TLI.getTypeToExpandTo(*DAG.getContext(), LHS.getValueType())); + + if (HasOpCarry) { + // Expand the subcomponents. + SDValue LHSL, LHSH, RHSL, RHSH; + GetExpandedInteger(LHS, LHSL, LHSH); + GetExpandedInteger(RHS, RHSL, RHSH); + SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); + SDValue LoOps[2] = { LHSL, RHSL }; + SDValue HiOps[3] = { LHSH, RHSH }; + + unsigned Opc = N->getOpcode() == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY; + Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(Opc, dl, VTList, HiOps); + + Ovf = Hi.getValue(1); + } else { + // Expand the result by simply replacing it with the equivalent + // non-overflow-checking operation. + auto Opc = N->getOpcode() == ISD::UADDO ? 
ISD::ADD : ISD::SUB; + SDValue Sum = DAG.getNode(Opc, dl, LHS.getValueType(), LHS, RHS); + SplitInteger(Sum, Lo, Hi); + + // Calculate the overflow: addition overflows iff a + b < a, and subtraction + // overflows iff a - b > a. + auto Cond = N->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT; + Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond); + } + + // Legalized the flag result - switch anything that used the old flag to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Ovf); +} + +void DAGTypeLegalizer::ExpandIntRes_ADDSUBCARRY(SDNode *N, + SDValue &Lo, SDValue &Hi) { + // Expand the subcomponents. + SDValue LHSL, LHSH, RHSL, RHSH; + SDLoc dl(N); + GetExpandedInteger(N->getOperand(0), LHSL, LHSH); + GetExpandedInteger(N->getOperand(1), RHSL, RHSH); + SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); + SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; + SDValue HiOps[3] = { LHSH, RHSH, SDValue() }; + + Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); + + // Legalized the flag result - switch anything that used the old flag to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); +} + void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -2508,29 +2674,6 @@ void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } -void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, - SDValue &Lo, SDValue &Hi) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDLoc dl(N); - - // Expand the result by simply replacing it with the equivalent - // non-overflow-checking operation. - SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ? - ISD::ADD : ISD::SUB, dl, LHS.getValueType(), - LHS, RHS); - SplitInteger(Sum, Lo, Hi); - - // Calculate the overflow: addition overflows iff a + b < a, and subtraction - // overflows iff a - b > a. - SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, - N->getOpcode () == ISD::UADDO ? - ISD::SETULT : ISD::SETUGT); - - // Use the calculated overflow everywhere. - ReplaceValueWith(SDValue(N, 1), Ofl); -} - void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -2586,24 +2729,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = true; - Entry.isZExt = false; + Entry.IsSExt = true; + Entry.IsZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. 
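// Illustrative sketch (editorial, not part of this commit): what the new
// UADDO/ADDCARRY and USUBO/SUBCARRY expansions above compute, spelled out on
// 32-bit halves. The low-half op produces a value plus a carry/borrow bit,
// and the *CARRY node feeds that bit into the high-half op.
#include <cstdint>

static void add64_via_carry(uint32_t LHSL, uint32_t LHSH, uint32_t RHSL,
                            uint32_t RHSH, uint32_t &Lo, uint32_t &Hi) {
  Lo = LHSL + RHSL;                        // ISD::UADDO, result 0
  uint32_t Carry = Lo < LHSL ? 1u : 0u;    // ISD::UADDO, result 1 (carry out)
  Hi = LHSH + RHSH + Carry;                // ISD::ADDCARRY, result 0
}

static void sub64_via_borrow(uint32_t LHSL, uint32_t LHSH, uint32_t RHSL,
                             uint32_t RHSH, uint32_t &Lo, uint32_t &Hi) {
  Lo = LHSL - RHSL;                        // ISD::USUBO, result 0
  uint32_t Borrow = LHSL < RHSL ? 1u : 0u; // ISD::USUBO, result 1 (borrow out)
  Hi = LHSH - RHSH - Borrow;               // ISD::SUBCARRY, result 0
}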
Entry.Node = Temp; Entry.Ty = PtrTy->getPointerTo(); - Entry.isSExt = true; - Entry.isZExt = false; + Entry.IsSExt = true; + Entry.IsZExt = false; Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) - .setSExtResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) + .setSExtResult(); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2743,6 +2887,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break; + case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; @@ -2877,14 +3022,16 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, return; } - // Lower with SETCCE if the target supports it. + // Lower with SETCCE or SETCCCARRY if the target supports it. + EVT HiVT = LHSHi.getValueType(); + EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT); + bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT); + // FIXME: Make all targets support this, then remove the other lowering. - if (TLI.getOperationAction( - ISD::SETCCE, - TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) == - TargetLowering::Custom) { - // SETCCE can detect < and >= directly. For > and <=, flip operands and - // condition code. + if (HasSETCCCARRY || + TLI.getOperationAction(ISD::SETCCE, ExpandVT) == TargetLowering::Custom) { + // SETCCE/SETCCCARRY can detect < and >= directly. For > and <=, flip + // operands and condition code. bool FlipOperands = false; switch (CCCode) { case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; @@ -2898,27 +3045,28 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, std::swap(LHSHi, RHSHi); } // Perform a wide subtraction, feeding the carry from the low part into - // SETCCE. The SETCCE operation is essentially looking at the high part of - // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or - // positive iff LHS >= RHS. - SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); - SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo); - SDValue Res = - DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()), - LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); + // SETCCE/SETCCCARRY. The SETCCE/SETCCCARRY operation is essentially + // looking at the high part of the result of LHS - RHS. It is negative + // iff LHS < RHS. It is zero or positive iff LHS >= RHS. + EVT LoVT = LHSLo.getValueType(); + SDVTList VTList = DAG.getVTList( + LoVT, HasSETCCCARRY ? getSetCCResultType(LoVT) : MVT::Glue); + SDValue LowCmp = DAG.getNode(HasSETCCCARRY ? ISD::USUBO : ISD::SUBC, dl, + VTList, LHSLo, RHSLo); + SDValue Res = DAG.getNode(HasSETCCCARRY ? 
ISD::SETCCCARRY : ISD::SETCCE, dl, + getSetCCResultType(HiVT), LHSHi, RHSHi, + LowCmp.getValue(1), DAG.getCondCode(CCCode)); NewLHS = Res; NewRHS = SDValue(); return; } - NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), - LHSHi, RHSHi, ISD::SETEQ, false, - DagCombineInfo, dl); + NewLHS = TLI.SimplifySetCC(getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ, + false, DagCombineInfo, dl); if (!NewLHS.getNode()) - NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), - LHSHi, RHSHi, ISD::SETEQ); - NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), - NewLHS, LoCmp, HiCmp); + NewLHS = + DAG.getSetCC(dl, getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ); + NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), NewLHS, LoCmp, HiCmp); NewRHS = SDValue(); } @@ -2971,8 +3119,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, - DAG.getCondCode(CCCode)), 0); + return SDValue( + DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { @@ -2993,6 +3141,24 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { LowCmp.getValue(1), Cond); } +SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + SDLoc dl = SDLoc(N); + + SDValue LHSLo, LHSHi, RHSLo, RHSHi; + GetExpandedInteger(LHS, LHSLo, LHSHi); + GetExpandedInteger(RHS, RHSLo, RHSHi); + + // Expand to a SUBE for the low part and a smaller SETCCCARRY for the high. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType()); + SDValue LowCmp = DAG.getNode(ISD::SUBCARRY, dl, VTList, LHSLo, RHSLo, Carry); + return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi, + LowCmp.getValue(1), Cond); +} + SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. 
// It follows that either the result of the shift is undefined, or the @@ -3226,7 +3392,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } @@ -3269,7 +3435,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { @@ -3317,7 +3483,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { @@ -3420,5 +3586,5 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), NewOps); + return DAG.getBuildVector(N->getValueType(0), dl, NewOps); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index cf19d75..001eed9 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -80,6 +80,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) { SDValue Res(&Node, i); + EVT VT = Res.getValueType(); bool Failed = false; unsigned Mapped = 0; @@ -129,13 +130,17 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { dbgs() << "Unprocessed value in a map!"; Failed = true; } - } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) { + } else if (isTypeLegal(VT) || IgnoreNodeResults(&Node)) { if (Mapped > 1) { dbgs() << "Value with legal type was transformed!"; Failed = true; } } else { - if (Mapped == 0) { + // If the value can be kept in HW registers, softening machinery can + // leave it unchanged and don't put it to any map. + if (Mapped == 0 && + !(getTypeAction(VT) == TargetLowering::TypeSoftenFloat && + isLegalInHWReg(VT))) { dbgs() << "Processed value not in any map!"; Failed = true; } else if (Mapped & (Mapped - 1)) { @@ -199,8 +204,7 @@ bool DAGTypeLegalizer::run() { // non-leaves. for (SDNode &Node : DAG.allnodes()) { if (Node.getNumOperands() == 0) { - Node.setNodeId(ReadyToProcess); - Worklist.push_back(&Node); + AddToWorklist(&Node); } else { Node.setNodeId(Unanalyzed); } @@ -331,6 +335,7 @@ ScanOperands: // to the worklist etc. if (NeedsReanalyzing) { assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?"); + N->setNodeId(NewNode); // Recompute the NodeId and correct processed operands, adding the node to // the worklist if ready. 
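// Illustrative sketch (editorial, not part of this commit): the idea behind
// the USUBO + SETCCCARRY comparison lowering in the hunks above. For an
// unsigned wide SETULT, subtract the low halves to get a borrow; the wide
// comparison is then decided by whether the high-half subtraction, with that
// borrow folded in, underflows.
#include <cstdint>

static bool ult64_via_borrow(uint32_t LHSL, uint32_t LHSH,
                             uint32_t RHSL, uint32_t RHSH) {
  uint32_t BorrowLo = LHSL < RHSL ? 1u : 0u; // ISD::USUBO on the low halves
  // SETCCCARRY(LHSH, RHSH, BorrowLo, SETULT): does LHSH - RHSH - BorrowLo
  // produce a borrow out?
  uint64_t HiDiff = (uint64_t)LHSH - (uint64_t)RHSH - (uint64_t)BorrowLo;
  return (HiDiff >> 32) != 0;                // borrow out <=> LHS < RHS
}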
@@ -918,9 +923,9 @@ SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) { assert(Op.getValueType().isVector() && "Only applies to vectors!"); unsigned EltWidth = Op.getScalarValueSizeInBits(); EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth); - unsigned NumElts = Op.getValueType().getVectorNumElements(); + auto EltCnt = Op.getValueType().getVectorElementCount(); return DAG.getNode(ISD::BITCAST, SDLoc(Op), - EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op); + EVT::getVectorVT(*DAG.getContext(), EltNVT, EltCnt), Op); } SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op, @@ -1077,8 +1082,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1087,9 +1092,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index ec55662..c46d1b0 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -191,6 +191,11 @@ private: void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, SDValue &Lo, SDValue &Hi); + void AddToWorklist(SDNode *N) { + N->setNodeId(ReadyToProcess); + Worklist.push_back(N); + } + //===--------------------------------------------------------------------===// // Integer Promotion Support: LegalizeIntegerTypes.cpp //===--------------------------------------------------------------------===// @@ -274,6 +279,7 @@ private: SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); + SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); @@ -306,6 +312,7 @@ private: SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -345,6 +352,7 @@ private: void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_ADDSUBCARRY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, 
SDValue &Hi); @@ -373,6 +381,7 @@ private: SDValue ExpandIntOp_SELECT_CC(SDNode *N); SDValue ExpandIntOp_SETCC(SDNode *N); SDValue ExpandIntOp_SETCCE(SDNode *N); + SDValue ExpandIntOp_SETCCCARRY(SDNode *N); SDValue ExpandIntOp_Shift(SDNode *N); SDValue ExpandIntOp_SINT_TO_FP(SDNode *N); SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo); @@ -407,23 +416,13 @@ private: } void SetSoftenedFloat(SDValue Op, SDValue Result); - // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary. - void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) { - // When the result type can be kept in HW registers, the converted - // NewRes node could have the same type. We can save the effort in - // cloning every user of N in SoftenFloatOperand or other legalization functions, - // by calling ReplaceValueWith here to update all users. - if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo))) - ReplaceValueWith(SDValue(N, ResNo), NewRes); - } - // Convert Float Results to Integer for Non-HW-supported Operations. bool SoftenFloatResult(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); @@ -462,17 +461,23 @@ private: SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); // Return true if we can skip softening the given operand or SDNode because - // it was soften before by SoftenFloatResult and references to the operand - // were replaced by ReplaceValueWith. + // either it was soften before by SoftenFloatResult and references to the + // operand were replaced by ReplaceValueWith or it's value type is legal in HW + // registers and the operand can be left unchanged. bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo); // Convert Float Operand to Integer for Non-HW-supported Operations. 
bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); + SDValue SoftenFloatOp_COPY_TO_REG(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); + SDValue SoftenFloatOp_FABS(SDNode *N); + SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); + SDValue SoftenFloatOp_FNEG(SDNode *N); SDValue SoftenFloatOp_FP_EXTEND(SDNode *N); SDValue SoftenFloatOp_FP_ROUND(SDNode *N); SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N); + SDValue SoftenFloatOp_SELECT(SDNode *N); SDValue SoftenFloatOp_SELECT_CC(SDNode *N); SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); @@ -597,6 +602,7 @@ private: SDValue ScalarizeVecRes_TernaryOp(SDNode *N); SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); + SDValue ScalarizeVecRes_VecInregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); @@ -621,6 +627,7 @@ private: SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N); SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecOp_VSELECT(SDNode *N); + SDValue ScalarizeVecOp_VSETCC(SDNode *N); SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo); @@ -666,12 +673,14 @@ private: // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); + SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); SDValue SplitVecOp_BITCAST(SDNode *N); SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); @@ -713,6 +722,7 @@ private: SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); + SDValue WidenVSELECTAndMask(SDNode *N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); SDValue WidenVecRes_UNDEF(SDNode *N); @@ -782,6 +792,13 @@ private: /// By default, the vector will be widened with undefined values. SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); + /// Return a mask of vector type MaskVT to replace InMask. Also adjust + /// MaskVT to ToMaskVT if needed with vector extension or truncation. + SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT); + + /// Get the target mask VT, and widen if needed. + EVT getSETCCWidenedResultTy(SDValue SetCC); + //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp //===--------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 3682c32..f330615 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -57,7 +57,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the floating point operand only if it was converted to integers. // Otherwise, it is a legal type like f128 that can be saved in a register. 
auto SoftenedOp = GetSoftenedFloat(InOp); - if (SoftenedOp == InOp) + if (isLegalInHWReg(SoftenedOp.getValueType())) break; SplitInteger(SoftenedOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); @@ -362,8 +362,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) { SmallVector<SDValue, 8> Ops; IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType()); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, - makeArrayRef(Ops.data(), NumElts)); + SDValue Vec = + DAG.getBuildVector(NVT, dl, makeArrayRef(Ops.data(), NumElts)); return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec); } @@ -396,10 +396,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) { NewElts.push_back(Hi); } - SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, - EVT::getVectorVT(*DAG.getContext(), - NewVT, NewElts.size()), - NewElts); + EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NewElts.size()); + SDValue NewVec = DAG.getBuildVector(NewVecVT, dl, NewElts); // Convert the new vector to the old vector type. return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec); @@ -458,7 +456,7 @@ SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType()); for (unsigned i = 1; i < NumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { @@ -512,8 +510,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, GetSplitOp(Op, Lo, Hi); } -void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, - SDValue &Hi) { +static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, + SelectionDAG &DAG) { + SDLoc DL(N); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + // Split the inputs. + SDValue Lo, Hi, LL, LH, RL, RH; + std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); + std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); + + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); + + return std::make_pair(Lo, Hi); +} + +void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH, CL, CH; SDLoc dl(N); GetSplitOp(N->getOperand(1), LL, LH); @@ -522,9 +536,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue Cond = N->getOperand(0); CL = CH = Cond; if (Cond.getValueType().isVector()) { + if (SDValue Res = WidenVSELECTAndMask(N)) + std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); + // It seems to improve code to generate two narrow SETCCs as opposed to + // splitting a wide result vector. + else if (Cond.getOpcode() == ISD::SETCC) + std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG); // Check if there are already splitted versions of the vector available and // use those instead of splitting the mask operand again. 
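// Illustrative sketch (editorial, not part of this commit): the effect of the
// SplitVSETCC helper above on plain arrays. Instead of computing one wide
// compare mask and then splitting it, each half of the select gets its own
// narrow compare built directly from the split operands.
#include <cstdint>

static void select_ult_split(const uint32_t A[8], const uint32_t B[8],
                             const uint32_t X[8], const uint32_t Y[8],
                             uint32_t Out[8]) {
  for (int i = 0; i < 4; ++i)          // low half: narrow SETCC + VSELECT
    Out[i] = A[i] < B[i] ? X[i] : Y[i];
  for (int i = 4; i < 8; ++i)          // high half: narrow SETCC + VSELECT
    Out[i] = A[i] < B[i] ? X[i] : Y[i];
}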
- if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) + else if (getTypeAction(Cond.getValueType()) == + TargetLowering::TypeSplitVector) GetSplitVector(Cond, CL, CH); else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d4fa20f..9355dbe 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandFSUB(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); @@ -224,6 +225,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { } return TranslateLegalizeResults(Op, Lowered); } + LLVM_FALLTHROUGH; case TargetLowering::Expand: Changed = true; return LegalizeOp(ExpandLoad(Op)); @@ -621,8 +623,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) { } NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - Value = DAG.getNode(ISD::BUILD_VECTOR, dl, - Op.getNode()->getValueType(0), Vals); + Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals); } else { SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG); @@ -692,6 +693,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandUINT_TO_FLOAT(Op); case ISD::FNEG: return ExpandFNEG(Op); + case ISD::FSUB: + return ExpandFSUB(Op); case ISD::SETCC: return UnrollVSETCC(Op); case ISD::BITREVERSE: @@ -720,8 +723,6 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { assert(VT.isVector() && !Mask.getValueType().isVector() && Op1.getValueType() == Op2.getValueType() && "Invalid type"); - unsigned NumElem = VT.getVectorNumElements(); - // If we can't even use the basic vector operations of // AND,OR,XOR, we will have to scalarize the op. // Notice that the operation may be 'promoted' which means that it is @@ -745,8 +746,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { DAG.getConstant(0, DL, BitTy)); // Broadcast the mask so that the entire vector is all-one or all zero. - SmallVector<SDValue, 8> Ops(NumElem, Mask); - Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops); + Mask = DAG.getSplatBuildVector(MaskTy, DL, Mask); // Bitcast the operands to be the same type as the mask. // This is needed when we select between FP types because @@ -1025,6 +1025,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) { return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandFSUB(SDValue Op) { + // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal, + // we can defer this to operation legalization where it will be lowered as + // a+(-b). 
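// Illustrative sketch (editorial, not part of this commit): the identity that
// lets ExpandFSUB defer to LegalizeDAG. IEEE subtraction a - b is defined as
// a + (-b), so when vector FNEG and FADD are legal there is no need to unroll
// the FSUB to scalars here.
static float fsub_via_fadd_fneg(float A, float B) {
  return A + (-B); // same result as A - B for all operands
}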
+ EVT VT = Op.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) && + TLI.isOperationLegalOrCustom(ISD::FADD, VT)) + return Op; // Defer to LegalizeDAG + + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { EVT VT = Op.getValueType(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); @@ -1102,7 +1114,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) { (EltVT.getSizeInBits()), dl, EltVT), DAG.getConstant(0, dl, EltVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 6906f67..6aa3270 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + R = ScalarizeVecRes_VecInregOp(N); + break; case ISD::ANY_EXTEND: case ISD::BITREVERSE: case ISD::BSWAP: @@ -97,6 +102,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::ZERO_EXTEND: + case ISD::FCANONICALIZE: R = ScalarizeVecRes_UnaryOp(N); break; @@ -257,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) { LHS, DAG.getValueType(ExtVT)); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) { + SDLoc DL(N); + SDValue Op = N->getOperand(0); + + EVT OpVT = Op.getValueType(); + EVT OpEltVT = OpVT.getVectorElementType(); + EVT EltVT = N->getValueType(0).getVectorElementType(); + + if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { + Op = GetScalarizedVector(Op); + } else { + Op = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op, + DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); + } + + switch (N->getOpcode()) { + case ISD::ANY_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op); + case ISD::ZERO_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op); + } + + llvm_unreachable("Illegal extend_vector_inreg opcode"); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { // If the operand is wider than the vector element type then it is implicitly // truncated. Make that explicit here. @@ -268,7 +302,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { } SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) { - SDValue Cond = GetScalarizedVector(N->getOperand(0)); + SDValue Cond = N->getOperand(0); + EVT OpVT = Cond.getValueType(); + SDLoc DL(N); + // The vselect result and true/value operands needs scalarizing, but it's + // not a given that the Cond does. For instance, in AVX512 v1i1 is legal. 
+ // See the similar logic in ScalarizeVecRes_VSETCC + if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { + Cond = GetScalarizedVector(Cond); + } else { + EVT VT = OpVT.getVectorElementType(); + Cond = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond, + DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); + } + SDValue LHS = GetScalarizedVector(N->getOperand(1)); TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false, false); @@ -436,6 +484,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VSELECT: Res = ScalarizeVecOp_VSELECT(N); break; + case ISD::SETCC: + Res = ScalarizeVecOp_VSETCC(N); + break; case ISD::STORE: Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; @@ -478,7 +529,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { N->getValueType(0).getScalarType(), Elt); // Revectorize the result so the types line up with what the uses of this // expression expect. - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op); + return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Op); } /// The vectors to concatenate have length one - use a BUILD_VECTOR instead. @@ -486,20 +537,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector<SDValue, 8> Ops(N->getNumOperands()); for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) Ops[i] = GetScalarizedVector(N->getOperand(i)); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops); + return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops); } /// If the input is a vector that needs to be scalarized, it must be <1 x ty>, /// so just return the element, ignoring the index. SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { + EVT VT = N->getValueType(0); SDValue Res = GetScalarizedVector(N->getOperand(0)); - if (Res.getValueType() != N->getValueType(0)) - Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), - Res); + if (Res.getValueType() != VT) + Res = VT.isFloatingPoint() + ? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res) + : DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res); return Res; } - /// If the input condition is a vector that needs to be scalarized, it must be /// <1 x i1>, so just convert to a normal ISD::SELECT /// (still with vector output type since that was acceptable if we got here). @@ -511,6 +563,36 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) { N->getOperand(2)); } +/// If the operand is a vector that needs to be scalarized then the +/// result must be v1i1, so just convert to a scalar SETCC and wrap +/// with a scalar_to_vector since the res type is legal if we got here +SDValue DAGTypeLegalizer::ScalarizeVecOp_VSETCC(SDNode *N) { + assert(N->getValueType(0).isVector() && + N->getOperand(0).getValueType().isVector() && + "Operand types must be vectors"); + assert(N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type"); + + EVT VT = N->getValueType(0); + SDValue LHS = GetScalarizedVector(N->getOperand(0)); + SDValue RHS = GetScalarizedVector(N->getOperand(1)); + + EVT OpVT = N->getOperand(0).getValueType(); + EVT NVT = VT.getVectorElementType(); + SDLoc DL(N); + // Turn it into a scalar SETCC. + SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, + N->getOperand(2)); + + // Vectors may have a different boolean contents to scalars. Promote the + // value appropriately. 
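// Illustrative sketch (editorial, not part of this commit): why the scalar
// SETCC result built above is run through getExtendForContent. A scalar
// compare yields 0 or 1, but when the target's vector boolean content is
// ZeroOrNegativeOneBooleanContent a mask element must be all-zeros or
// all-ones, so the single bit is sign-extended to the element width.
#include <cstdint>

static int32_t mask_element_from_compare(int32_t A, int32_t B) {
  int32_t Bit = (A < B) ? 1 : 0; // scalar SETCC: 0 or 1
  return -Bit;                   // sign extend i1 -> i32: 0 or 0xFFFFFFFF
}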
+ ISD::NodeType ExtendCode = + TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); + + Res = DAG.getNode(ExtendCode, DL, NVT, Res); + + return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Res); +} + /// If the value to store is a vector that needs to be scalarized, it must be /// <1 x ty>. Just store the element. SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){ @@ -637,6 +719,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -695,7 +778,7 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, GetSplitVector(N->getOperand(1), RHSLo, RHSHi); SDLoc dl(N); - const SDNodeFlags *Flags = N->getFlags(); + const SDNodeFlags Flags = N->getFlags(); unsigned Opcode = N->getOpcode(); Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags); Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags); @@ -781,10 +864,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned LoNumElts = LoVT.getVectorNumElements(); SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts); - Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps); + Lo = DAG.getBuildVector(LoVT, dl, LoOps); SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end()); - Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps); + Hi = DAG.getBuildVector(HiVT, dl, HiOps); } void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, @@ -928,7 +1011,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDLoc dl(N); SDValue InLo, InHi; - GetSplitVector(N0, InLo, InHi); + + if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(N0, InLo, InHi); + else + std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0); + EVT InLoVT = InLo.getValueType(); unsigned InNumElements = InLoVT.getVectorNumElements(); @@ -1253,12 +1341,9 @@ void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, if ((NumElements & 1) == 0 && SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) { LLVMContext &Ctx = *DAG.getContext(); - EVT NewSrcVT = EVT::getVectorVT( - Ctx, EVT::getIntegerVT( - Ctx, SrcVT.getScalarSizeInBits() * 2), - NumElements); - EVT SplitSrcVT = - EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2); + EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx); + EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx); + EVT SplitLoVT, SplitHiVT; std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT); if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) && @@ -1372,7 +1457,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, } // Construct the Lo/Hi output using a BUILD_VECTOR. - Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps); + Output = DAG.getBuildVector(NewVT, dl, SVOps); } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. 
Output = DAG.getUNDEF(NewVT); @@ -1466,8 +1551,31 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::FTRUNC: + case ISD::FCANONICALIZE: Res = SplitVecOp_UnaryOp(N); break; + + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + Res = SplitVecOp_ExtVecInRegOp(N); + break; + + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = SplitVecOp_VECREDUCE(N, OpNo); + break; } } @@ -1520,6 +1628,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } +SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { + EVT ResVT = N->getValueType(0); + SDValue Lo, Hi; + SDLoc dl(N); + + SDValue VecOp = N->getOperand(OpNo); + EVT VecVT = VecOp.getValueType(); + assert(VecVT.isVector() && "Can only split reduce vector operand"); + GetSplitVector(VecOp, Lo, Hi); + EVT LoOpVT, HiOpVT; + std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); + + bool NoNaN = N->getFlags().hasNoNaNs(); + unsigned CombineOpc = 0; + switch (N->getOpcode()) { + case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break; + case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break; + case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break; + case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break; + case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break; + case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break; + case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break; + case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; + case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; + case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; + case ISD::VECREDUCE_FMAX: + CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN; + break; + case ISD::VECREDUCE_FMIN: + CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN; + break; + default: + llvm_unreachable("Unexpected reduce ISD node"); + } + + // Use the appropriate scalar instruction on the split subvectors before + // reducing the now partially reduced smaller vector. + SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi); + return DAG.getNode(N->getOpcode(), dl, ResVT, Partial); +} + SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); @@ -1615,7 +1765,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { EltVT = MVT::i8; VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, VecVT.getVectorNumElements()); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps); + Vec = DAG.getBuildVector(VecVT, dl, ElementOps); } // Store the vector to the stack. @@ -1629,6 +1779,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { MachinePointerInfo(), EltVT); } +SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) { + SDValue Lo, Hi; + + // *_EXTEND_VECTOR_INREG only reference the lower half of the input, so + // splitting the result has the same effect as splitting the input operand. 
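// Illustrative sketch (editorial, not part of this commit): the shape of the
// SplitVecOp_VECREDUCE lowering above, on plain arrays. The two halves of the
// operand are first combined lane-wise with the scalar opcode, and the
// reduction then runs on the half-width partial result.
#include <cstdint>

static uint32_t vecreduce_add_v8(const uint32_t V[8]) {
  uint32_t Partial[4];
  for (int i = 0; i < 4; ++i)
    Partial[i] = V[i] + V[i + 4]; // CombineOpc (ISD::ADD) on Lo and Hi halves
  uint32_t Sum = 0;               // VECREDUCE_ADD on the partial vector
  for (int i = 0; i < 4; ++i)
    Sum += Partial[i];
  return Sum;
}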
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo) { EVT LoVT, HiVT; @@ -1881,7 +2041,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts); + return DAG.getBuildVector(N->getValueType(0), DL, Elts); } SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { @@ -2165,7 +2325,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); - const SDNodeFlags *Flags = N->getFlags(); + const SDNodeFlags Flags = N->getFlags(); while (!TLI.isTypeLegal(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); @@ -2313,7 +2473,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { unsigned Opcode = N->getOpcode(); unsigned InVTNumElts = InVT.getVectorNumElements(); - const SDNodeFlags *Flags = N->getFlags(); + const SDNodeFlags Flags = N->getFlags(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); @@ -2323,6 +2483,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(Opcode, DL, WidenVT, InOp); return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags); } + if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) { + // If both input and result vector types are of same width, extend + // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which + // accepts fewer elements in the result than in the input. 
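// Illustrative sketch (editorial, not part of this commit): what the
// SIGN_EXTEND_VECTOR_INREG path added above produces in the equal-width case.
// Assuming 128-bit vectors, widening v2i32 = sext v2i16 gives a v4i32 result
// and a v8i16 input of the same total size; only the low four i16 lanes are
// extended into the four i32 result lanes.
#include <cstdint>

static void sext_vector_inreg_v8i16_to_v4i32(const int16_t In[8],
                                             int32_t Out[4]) {
  for (int i = 0; i < 4; ++i)
    Out[i] = (int32_t)In[i]; // lanes 4..7 of the widened input are ignored
}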
+ if (Opcode == ISD::SIGN_EXTEND) + return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT); + if (Opcode == ISD::ZERO_EXTEND) + return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT); + } } if (TLI.isTypeLegal(InWidenVT)) { @@ -2375,7 +2544,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, DL, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) { @@ -2430,7 +2599,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) { while (Ops.size() != WidenNumElts) Ops.push_back(DAG.getUNDEF(WidenSVT)); - return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, DL, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { @@ -2568,7 +2737,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { if (InVT.isVector()) NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops); else - NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops); + NewVec = DAG.getBuildVector(NewInVT, dl, Ops); return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec); } } @@ -2593,7 +2762,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps); + return DAG.getBuildVector(WidenVT, dl, NewOps); } SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { @@ -2663,7 +2832,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) Ops[Idx] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { @@ -2704,7 +2873,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -2814,6 +2983,213 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) { WidenVT, N->getOperand(0)); } +// Return true if this is a node that could have two SETCCs as operands. +static inline bool isLogicalMaskOp(unsigned Opcode) { + switch (Opcode) { + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return true; + } + return false; +} + +// This is used just for the assert in convertMask(). Check that this either +// a SETCC or a previously handled SETCC by convertMask(). 
+#ifndef NDEBUG +static inline bool isSETCCorConvertedSETCC(SDValue N) { + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) + N = N.getOperand(0); + else if (N.getOpcode() == ISD::CONCAT_VECTORS) { + for (unsigned i = 1; i < N->getNumOperands(); ++i) + if (!N->getOperand(i)->isUndef()) + return false; + N = N.getOperand(0); + } + + if (N.getOpcode() == ISD::TRUNCATE) + N = N.getOperand(0); + else if (N.getOpcode() == ISD::SIGN_EXTEND) + N = N.getOperand(0); + + if (isLogicalMaskOp(N.getOpcode())) + return isSETCCorConvertedSETCC(N.getOperand(0)) && + isSETCCorConvertedSETCC(N.getOperand(1)); + + return (N.getOpcode() == ISD::SETCC || + ISD::isBuildVectorOfConstantSDNodes(N.getNode())); +} +#endif + +// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT +// to ToMaskVT if needed with vector extension or truncation. +SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT, + EVT ToMaskVT) { + // Currently a SETCC or a AND/OR/XOR with two SETCCs are handled. + // FIXME: This code seems to be too restrictive, we might consider + // generalizing it or dropping it. + assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument."); + + // Make a new Mask node, with a legal result VT. + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0; i < InMask->getNumOperands(); ++i) + Ops.push_back(InMask->getOperand(i)); + SDValue Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops); + + // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign + // extend or truncate is needed. + LLVMContext &Ctx = *DAG.getContext(); + unsigned MaskScalarBits = MaskVT.getScalarSizeInBits(); + unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits(); + if (MaskScalarBits < ToMaskScalBits) { + EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(), + MaskVT.getVectorNumElements()); + Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask); + } else if (MaskScalarBits > ToMaskScalBits) { + EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(), + MaskVT.getVectorNumElements()); + Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask); + } + + assert(Mask->getValueType(0).getScalarSizeInBits() == + ToMaskVT.getScalarSizeInBits() && + "Mask should have the right element size by now."); + + // Adjust Mask to the right number of elements. + unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements(); + if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) { + MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy); + Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask, + ZeroIdx); + } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) { + unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls); + EVT SubVT = Mask->getValueType(0); + SmallVector<SDValue, 16> SubConcatOps(NumSubVecs); + SubConcatOps[0] = Mask; + for (unsigned i = 1; i < NumSubVecs; ++i) + SubConcatOps[i] = DAG.getUNDEF(SubVT); + Mask = + DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps); + } + + assert((Mask->getValueType(0) == ToMaskVT) && + "A mask of ToMaskVT should have been produced by now."); + + return Mask; +} + +// Get the target mask VT, and widen if needed. 
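// Illustrative sketch (editorial, not part of this commit): the two
// adjustments convertMask above performs, on plain arrays. The mask elements
// are first sign-extended (or truncated) to the target element size, and the
// element count is then trimmed with an EXTRACT_SUBVECTOR at index 0 or
// padded out with undef lanes (zeros stand in for undef here).
#include <cstdint>

static void convert_mask_v8i16_to_v4i32(const int16_t In[8], int32_t Out[4]) {
  int32_t Wide[8];
  for (int i = 0; i < 8; ++i)
    Wide[i] = (int32_t)In[i]; // element size: sign-extend each mask element
  for (int i = 0; i < 4; ++i)
    Out[i] = Wide[i];         // element count: keep only the low four lanes
}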
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) { + assert(SetCC->getOpcode() == ISD::SETCC); + LLVMContext &Ctx = *DAG.getContext(); + EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType()); + if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) + MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT); + return MaskVT; +} + +// This method tries to handle VSELECT and its mask by legalizing operands +// (which may require widening) and if needed adjusting the mask vector type +// to match that of the VSELECT. Without it, many cases end up with +// scalarization of the SETCC, with many unnecessary instructions. +SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) { + LLVMContext &Ctx = *DAG.getContext(); + SDValue Cond = N->getOperand(0); + + if (N->getOpcode() != ISD::VSELECT) + return SDValue(); + + if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode())) + return SDValue(); + + // If this is a splitted VSELECT that was previously already handled, do + // nothing. + if (Cond->getValueType(0).getScalarSizeInBits() != 1) + return SDValue(); + + EVT VSelVT = N->getValueType(0); + // Only handle vector types which are a power of 2. + if (!isPowerOf2_64(VSelVT.getSizeInBits())) + return SDValue(); + + // Don't touch if this will be scalarized. + EVT FinalVT = VSelVT; + while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector) + FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx); + + if (FinalVT.getVectorNumElements() == 1) + return SDValue(); + + // If there is support for an i1 vector mask, don't touch. + if (Cond.getOpcode() == ISD::SETCC) { + EVT SetCCOpVT = Cond->getOperand(0).getValueType(); + while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal) + SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT); + EVT SetCCResVT = getSetCCResultType(SetCCOpVT); + if (SetCCResVT.getScalarSizeInBits() == 1) + return SDValue(); + } + + // Get the VT and operands for VSELECT, and widen if needed. + SDValue VSelOp1 = N->getOperand(1); + SDValue VSelOp2 = N->getOperand(2); + if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) { + VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT); + VSelOp1 = GetWidenedVector(VSelOp1); + VSelOp2 = GetWidenedVector(VSelOp2); + } + + // The mask of the VSELECT should have integer elements. + EVT ToMaskVT = VSelVT; + if (!ToMaskVT.getScalarType().isInteger()) + ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger(); + + SDValue Mask; + if (Cond->getOpcode() == ISD::SETCC) { + EVT MaskVT = getSETCCWidenedResultTy(Cond); + Mask = convertMask(Cond, MaskVT, ToMaskVT); + } else if (isLogicalMaskOp(Cond->getOpcode()) && + Cond->getOperand(0).getOpcode() == ISD::SETCC && + Cond->getOperand(1).getOpcode() == ISD::SETCC) { + // Cond is (AND/OR/XOR (SETCC, SETCC)) + SDValue SETCC0 = Cond->getOperand(0); + SDValue SETCC1 = Cond->getOperand(1); + EVT VT0 = getSETCCWidenedResultTy(SETCC0); + EVT VT1 = getSETCCWidenedResultTy(SETCC1); + unsigned ScalarBits0 = VT0.getScalarSizeInBits(); + unsigned ScalarBits1 = VT1.getScalarSizeInBits(); + unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits(); + EVT MaskVT; + // If the two SETCCs have different VTs, either extend/truncate one of + // them to the other "towards" ToMaskVT, or truncate one and extend the + // other to ToMaskVT. + if (ScalarBits0 != ScalarBits1) { + EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1); + EVT WideVT = ((NarrowVT == VT0) ? 
VT1 : VT0); + if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits()) + MaskVT = WideVT; + else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits()) + MaskVT = NarrowVT; + else + MaskVT = ToMaskVT; + } else + // If the two SETCCs have the same VT, don't change it. + MaskVT = VT0; + + // Make new SETCCs and logical nodes. + SETCC0 = convertMask(SETCC0, VT0, MaskVT); + SETCC1 = convertMask(SETCC1, VT1, MaskVT); + Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1); + + // Convert the logical op for VSELECT if needed. + Mask = convertMask(Cond, MaskVT, ToMaskVT); + } else + return SDValue(); + + return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2); +} + SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); @@ -2821,6 +3197,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { SDValue Cond1 = N->getOperand(0); EVT CondVT = Cond1.getValueType(); if (CondVT.isVector()) { + if (SDValue Res = WidenVSELECTAndMask(N)) + return Res; + EVT CondEltVT = CondVT.getVectorElementType(); EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenNumElts); @@ -3093,7 +3472,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { @@ -3144,7 +3523,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { @@ -3565,10 +3944,9 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, for (; i != WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } - void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST) { // The strategy assumes that we can efficiently store power-of-two widths. 
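// Illustrative sketch (editorial, not part of this commit): the power-of-two
// strategy the GenWidenVectorStores comment above refers to. A store of seven
// i32 lanes is emitted as a v4i32, a v2i32 and a scalar i32 store at
// increasing offsets, so every piece has a width the target stores
// efficiently.
#include <cstdint>
#include <cstring>

static void store_v7i32(uint8_t *Ptr, const uint32_t Val[7]) {
  std::memcpy(Ptr +  0, Val + 0, 4 * sizeof(uint32_t)); // v4i32 piece
  std::memcpy(Ptr + 16, Val + 4, 2 * sizeof(uint32_t)); // v2i32 piece
  std::memcpy(Ptr + 24, Val + 6, 1 * sizeof(uint32_t)); // i32 piece
}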
@@ -3737,5 +4115,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) Ops[Idx] = FillVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + return DAG.getBuildVector(NVT, dl, Ops); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index ded8e68..a21b4c7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -57,10 +57,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) RegPressure.resize(NumRC); std::fill(RegLimit.begin(), RegLimit.end(), 0); std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); - I != E; ++I) - RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF); + for (const TargetRegisterClass *RC : TRI->regclasses()) + RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF); ParallelLiveRanges = 0; HorizontalVerticalBalance = 0; @@ -69,12 +67,11 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) unsigned ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { unsigned NumberDeps = 0; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) continue; - SUnit *PredSU = I->getSUnit(); + SUnit *PredSU = Pred.getSUnit(); const SDNode *ScegN = PredSU->getNode(); if (!ScegN) @@ -107,12 +104,11 @@ ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU, unsigned RCId) { unsigned NumberDeps = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isCtrl()) + for (const SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) continue; - SUnit *SuccSU = I->getSUnit(); + SUnit *SuccSU = Succ.getSUnit(); const SDNode *ScegN = SuccSU->getNode(); if (!ScegN) continue; @@ -144,9 +140,8 @@ unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU, static unsigned numberCtrlDepsInSU(SUnit *SU) { unsigned NumberDeps = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) - if (I->isCtrl()) + for (const SDep &Succ : SU->Succs) + if (Succ.isCtrl()) NumberDeps++; return NumberDeps; @@ -154,9 +149,8 @@ static unsigned numberCtrlDepsInSU(SUnit *SU) { static unsigned numberCtrlPredInSU(SUnit *SU) { unsigned NumberDeps = 0; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) - if (I->isCtrl()) + for (SDep &Pred : SU->Preds) + if (Pred.isCtrl()) NumberDeps++; return NumberDeps; @@ -214,15 +208,14 @@ bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const { /// of SU, return it, otherwise return null. SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) { SUnit *OnlyAvailablePred = nullptr; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - SUnit &Pred = *I->getSUnit(); - if (!Pred.isScheduled) { + for (const SDep &Pred : SU->Preds) { + SUnit &PredSU = *Pred.getSUnit(); + if (!PredSU.isScheduled) { // We found an available, but not scheduled, predecessor. If it's the // only one we have found, keep track of it... otherwise give up. 
- if (OnlyAvailablePred && OnlyAvailablePred != &Pred) + if (OnlyAvailablePred && OnlyAvailablePred != &PredSU) return nullptr; - OnlyAvailablePred = &Pred; + OnlyAvailablePred = &PredSU; } } return OnlyAvailablePred; @@ -232,9 +225,8 @@ void ResourcePriorityQueue::push(SUnit *SU) { // Look at all of the successors of this node. Count the number of nodes that // this node is the sole unscheduled node for. unsigned NumNodesBlocking = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) - if (getSingleUnscheduledPred(I->getSUnit()) == SU) + for (const SDep &Succ : SU->Succs) + if (getSingleUnscheduledPred(Succ.getSUnit()) == SU) ++NumNodesBlocking; NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking; @@ -271,14 +263,13 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { // Now see if there are no other dependencies // to instructions already in the packet. for (unsigned i = 0, e = Packet.size(); i != e; ++i) - for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(), - E = Packet[i]->Succs.end(); I != E; ++I) { + for (const SDep &Succ : Packet[i]->Succs) { // Since we do not add pseudos to packets, might as well // ignore order deps. - if (I->isCtrl()) + if (Succ.isCtrl()) continue; - if (I->getSUnit() == SU) + if (Succ.getSUnit() == SU) return false; } @@ -364,16 +355,11 @@ int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) { return RegBalance; if (RawPressure) { - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; + for (const TargetRegisterClass *RC : TRI->regclasses()) RegBalance += rawRegPressureDelta(SU, RC->getID()); - } } else { - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; + for (const TargetRegisterClass *RC : TRI->regclasses()) { if ((RegPressure[RC->getID()] + rawRegPressureDelta(SU, RC->getID()) > 0) && (RegPressure[RC->getID()] + @@ -506,11 +492,10 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) { } } } - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl() || (I->getSUnit()->NumRegDefsLeft == 0)) + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl() || (Pred.getSUnit()->NumRegDefsLeft == 0)) continue; - --I->getSUnit()->NumRegDefsLeft; + --Pred.getSUnit()->NumRegDefsLeft; } } @@ -522,10 +507,9 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) { // number of live ranges. All others, increase it. unsigned NumberNonControlDeps = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - adjustPriorityOfUnscheduledPreds(I->getSUnit()); - if (!I->isCtrl()) + for (const SDep &Succ : SU->Succs) { + adjustPriorityOfUnscheduledPreds(Succ.getSUnit()); + if (!Succ.isCtrl()) NumberNonControlDeps++; } @@ -602,8 +586,7 @@ SUnit *ResourcePriorityQueue::pop() { std::vector<SUnit *>::iterator Best = Queue.begin(); if (!DisableDFASched) { int BestCost = SUSchedulingCost(*Best); - for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()), - E = Queue.end(); I != E; ++I) { + for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) { if (SUSchedulingCost(*I) > BestCost) { BestCost = SUSchedulingCost(*I); @@ -613,8 +596,7 @@ SUnit *ResourcePriorityQueue::pop() { } // Use default TD scheduling mechanism. 
else { - for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()), - E = Queue.end(); I != E; ++I) + for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 62e7733..1379940 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SchedulerRegistry.h" #include "InstrEmitter.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InlineAsm.h" @@ -160,18 +160,17 @@ void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) { void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) { // Bottom up: release predecessors - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - ReleasePred(SU, &*I); - if (I->isAssignedRegDep()) { + for (SDep &Pred : SU->Preds) { + ReleasePred(SU, &Pred); + if (Pred.isAssignedRegDep()) { // This is a physical register dependency and it's impossible or // expensive to copy the register. Make sure nothing that can // clobber the register is scheduled between the predecessor and // this node. - if (!LiveRegDefs[I->getReg()]) { + if (!LiveRegDefs[Pred.getReg()]) { ++NumLiveRegs; - LiveRegDefs[I->getReg()] = I->getSUnit(); - LiveRegCycles[I->getReg()] = CurCycle; + LiveRegDefs[Pred.getReg()] = Pred.getSUnit(); + LiveRegCycles[Pred.getReg()] = CurCycle; } } } @@ -191,16 +190,15 @@ void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) { ReleasePredecessors(SU, CurCycle); // Release all the implicit physical register defs that are live. 
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isAssignedRegDep()) { - if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) { + for (SDep &Succ : SU->Succs) { + if (Succ.isAssignedRegDep()) { + if (LiveRegCycles[Succ.getReg()] == Succ.getSUnit()->getHeight()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); - assert(LiveRegDefs[I->getReg()] == SU && + assert(LiveRegDefs[Succ.getReg()] == SU && "Physical register dependency violated?"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = nullptr; - LiveRegCycles[I->getReg()] = 0; + LiveRegDefs[Succ.getReg()] = nullptr; + LiveRegCycles[Succ.getReg()] = 0; } } } @@ -282,22 +280,20 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { SmallVector<SDep, 4> LoadPreds; SmallVector<SDep, 4> NodePreds; SmallVector<SDep, 4> NodeSuccs; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) - ChainPred = *I; - else if (I->getSUnit()->getNode() && - I->getSUnit()->getNode()->isOperandOf(LoadNode)) - LoadPreds.push_back(*I); + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) + ChainPred = Pred; + else if (Pred.getSUnit()->getNode() && + Pred.getSUnit()->getNode()->isOperandOf(LoadNode)) + LoadPreds.push_back(Pred); else - NodePreds.push_back(*I); + NodePreds.push_back(Pred); } - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isCtrl()) - ChainSuccs.push_back(*I); + for (SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) + ChainSuccs.push_back(Succ); else - NodeSuccs.push_back(*I); + NodeSuccs.push_back(Succ); } if (ChainPred.getSUnit()) { @@ -354,21 +350,19 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { NewSU = Clone(SU); // New SUnit has the exact same predecessors. - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) - if (!I->isArtificial()) - AddPred(NewSU, *I); + for (SDep &Pred : SU->Preds) + if (!Pred.isArtificial()) + AddPred(NewSU, Pred); // Only copy scheduled successors. Cut them from old node's successor // list and move them over. SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps; - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isArtificial()) + for (SDep &Succ : SU->Succs) { + if (Succ.isArtificial()) continue; - SUnit *SuccSU = I->getSUnit(); + SUnit *SuccSU = Succ.getSUnit(); if (SuccSU->isScheduled) { - SDep D = *I; + SDep D = Succ; D.setSUnit(NewSU); AddPred(SuccSU, D); D.setSUnit(SU); @@ -399,16 +393,15 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, // Only copy scheduled successors. Cut them from old node's successor // list and move them over. SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps; - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isArtificial()) + for (SDep &Succ : SU->Succs) { + if (Succ.isArtificial()) continue; - SUnit *SuccSU = I->getSUnit(); + SUnit *SuccSU = Succ.getSUnit(); if (SuccSU->isScheduled) { - SDep D = *I; + SDep D = Succ; D.setSUnit(CopyToSU); AddPred(SuccSU, D); - DelDeps.push_back(std::make_pair(SuccSU, *I)); + DelDeps.push_back(std::make_pair(SuccSU, Succ)); } } for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) { @@ -479,10 +472,9 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, SmallSet<unsigned, 4> RegAdded; // If this node would clobber any "live" register, then it's not ready. 
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isAssignedRegDep()) { - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + for (SDep &Pred : SU->Preds) { + if (Pred.isAssignedRegDep()) { + CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs, RegAdded, LRegs, TRI); } } @@ -755,9 +747,8 @@ void ScheduleDAGLinearize::Schedule() { // Glue user must be scheduled together with the glue operand. So other // users of the glue operand must be treated as its users. SDNode *ImmGUser = Glue->getGluedUser(); - for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end(); - ui != ue; ++ui) - if (*ui == ImmGUser) + for (const SDNode *U : Glue->uses()) + if (U == ImmGUser) --Degree; GUser->setNodeId(UDegree + Degree); Glue->setNodeId(1); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 3549ccd..70b1fa7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -15,13 +15,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SchedulerRegistry.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InlineAsm.h" @@ -226,6 +226,7 @@ private: void UnscheduleNodeBottomUp(SUnit*); void RestoreHazardCheckerBottomUp(); void BacktrackBottomUp(SUnit*, SUnit*); + SUnit *TryUnfoldSU(SUnit *); SUnit *CopyAndMoveSuccessors(SUnit*); void InsertCopiesAndMoveSuccs(SUnit*, unsigned, const TargetRegisterClass*, @@ -422,11 +423,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner, } // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END. if (N->isMachineOpcode()) { - if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameDestroyOpcode()) { + if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { ++NestLevel; - } else if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameSetupOpcode()) { + } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { if (NestLevel == 0) return false; --NestLevel; @@ -480,12 +479,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest, } // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END. if (N->isMachineOpcode()) { - if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameDestroyOpcode()) { + if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { ++NestLevel; MaxNest = std::max(MaxNest, NestLevel); - } else if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameSetupOpcode()) { + } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { assert(NestLevel != 0); --NestLevel; if (NestLevel == 0) @@ -524,21 +521,20 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest, /// interference on flags. 
void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) { // Bottom up: release predecessors - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - ReleasePred(SU, &*I); - if (I->isAssignedRegDep()) { + for (SDep &Pred : SU->Preds) { + ReleasePred(SU, &Pred); + if (Pred.isAssignedRegDep()) { // This is a physical register dependency and it's impossible or // expensive to copy the register. Make sure nothing that can // clobber the register is scheduled between the predecessor and // this node. - SUnit *RegDef = LiveRegDefs[I->getReg()]; (void)RegDef; - assert((!RegDef || RegDef == SU || RegDef == I->getSUnit()) && + SUnit *RegDef = LiveRegDefs[Pred.getReg()]; (void)RegDef; + assert((!RegDef || RegDef == SU || RegDef == Pred.getSUnit()) && "interference on register dependence"); - LiveRegDefs[I->getReg()] = I->getSUnit(); - if (!LiveRegGens[I->getReg()]) { + LiveRegDefs[Pred.getReg()] = Pred.getSUnit(); + if (!LiveRegGens[Pred.getReg()]) { ++NumLiveRegs; - LiveRegGens[I->getReg()] = SU; + LiveRegGens[Pred.getReg()] = SU; } } } @@ -550,7 +546,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) { if (!LiveRegDefs[CallResource]) for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) if (Node->isMachineOpcode() && - Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { unsigned NestLevel = 0; unsigned MaxNest = 0; SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII); @@ -737,15 +733,14 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { ReleasePredecessors(SU); // Release all the implicit physical register defs that are live. - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - // LiveRegDegs[I->getReg()] != SU when SU is a two-address node. - if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] == SU) { + for (SDep &Succ : SU->Succs) { + // LiveRegDegs[Succ.getReg()] != SU when SU is a two-address node. + if (Succ.isAssignedRegDep() && LiveRegDefs[Succ.getReg()] == SU) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = nullptr; - LiveRegGens[I->getReg()] = nullptr; - releaseInterferences(I->getReg()); + LiveRegDefs[Succ.getReg()] = nullptr; + LiveRegGens[Succ.getReg()] = nullptr; + releaseInterferences(Succ.getReg()); } } // Release the special call resource dependence, if this is the beginning @@ -755,7 +750,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; LiveRegDefs[CallResource] = nullptr; @@ -786,7 +781,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { } /// CapturePred - This does the opposite of ReleasePred. Since SU is being -/// unscheduled, incrcease the succ left count of its predecessors. Remove +/// unscheduled, increase the succ left count of its predecessors. Remove /// them from AvailableQueue if necessary. 
void ScheduleDAGRRList::CapturePred(SDep *PredEdge) { SUnit *PredSU = PredEdge->getSUnit(); @@ -806,17 +801,16 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: "); DEBUG(SU->dump(this)); - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - CapturePred(&*I); - if (I->isAssignedRegDep() && SU == LiveRegGens[I->getReg()]){ + for (SDep &Pred : SU->Preds) { + CapturePred(&Pred); + if (Pred.isAssignedRegDep() && SU == LiveRegGens[Pred.getReg()]){ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); - assert(LiveRegDefs[I->getReg()] == I->getSUnit() && + assert(LiveRegDefs[Pred.getReg()] == Pred.getSUnit() && "Physical register dependency violated?"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = nullptr; - LiveRegGens[I->getReg()] = nullptr; - releaseInterferences(I->getReg()); + LiveRegDefs[Pred.getReg()] = nullptr; + LiveRegGens[Pred.getReg()] = nullptr; + releaseInterferences(Pred.getReg()); } } @@ -826,7 +820,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { ++NumLiveRegs; LiveRegDefs[CallResource] = SU; LiveRegGens[CallResource] = CallSeqEndForStart[SU]; @@ -839,7 +833,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; LiveRegDefs[CallResource] = nullptr; @@ -899,7 +893,7 @@ void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() { std::vector<SUnit*>::const_iterator I = (Sequence.end() - LookAhead); unsigned HazardCycle = (*I)->getHeight(); - for (std::vector<SUnit*>::const_iterator E = Sequence.end(); I != E; ++I) { + for (auto E = Sequence.end(); I != E; ++I) { SUnit *SU = *I; for (; SU->getHeight() > HazardCycle; ++HazardCycle) { HazardRec->RecedeCycle(); @@ -941,6 +935,146 @@ static bool isOperandOf(const SUnit *SU, SDNode *N) { return false; } +/// TryUnfold - Attempt to unfold +SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) { + SDNode *N = SU->getNode(); + // Use while over if to ease fall through. + SmallVector<SDNode *, 2> NewNodes; + if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) + return nullptr; + + // unfolding an x86 DEC64m operation results in store, dec, load which + // can't be handled here so quit + if (NewNodes.size() == 3) + return nullptr; + + assert(NewNodes.size() == 2 && "Expected a load folding node!"); + + N = NewNodes[1]; + SDNode *LoadNode = NewNodes[0]; + unsigned NumVals = N->getNumValues(); + unsigned OldNumVals = SU->getNode()->getNumValues(); + + // LoadNode may already exist. This can happen when there is another + // load from the same location and producing the same type of value + // but it has different alignment or volatileness. + bool isNewLoad = true; + SUnit *LoadSU; + if (LoadNode->getNodeId() != -1) { + LoadSU = &SUnits[LoadNode->getNodeId()]; + // If LoadSU has already been scheduled, we should clone it but + // this would negate the benefit to unfolding so just return SU. 
+ if (LoadSU->isScheduled) + return SU; + isNewLoad = false; + } else { + LoadSU = CreateNewSUnit(LoadNode); + LoadNode->setNodeId(LoadSU->NodeNum); + + InitNumRegDefsLeft(LoadSU); + computeLatency(LoadSU); + } + + DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); + + // Now that we are committed to unfolding replace DAG Uses. + for (unsigned i = 0; i != NumVals; ++i) + DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i)); + DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1), + SDValue(LoadNode, 1)); + + SUnit *NewSU = CreateNewSUnit(N); + assert(N->getNodeId() == -1 && "Node already inserted!"); + N->setNodeId(NewSU->NodeNum); + + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { + if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { + NewSU->isTwoAddress = true; + break; + } + } + if (MCID.isCommutable()) + NewSU->isCommutable = true; + + InitNumRegDefsLeft(NewSU); + computeLatency(NewSU); + + // Record all the edges to and from the old SU, by category. + SmallVector<SDep, 4> ChainPreds; + SmallVector<SDep, 4> ChainSuccs; + SmallVector<SDep, 4> LoadPreds; + SmallVector<SDep, 4> NodePreds; + SmallVector<SDep, 4> NodeSuccs; + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) + ChainPreds.push_back(Pred); + else if (isOperandOf(Pred.getSUnit(), LoadNode)) + LoadPreds.push_back(Pred); + else + NodePreds.push_back(Pred); + } + for (SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) + ChainSuccs.push_back(Succ); + else + NodeSuccs.push_back(Succ); + } + + // Now assign edges to the newly-created nodes. + for (const SDep &Pred : ChainPreds) { + RemovePred(SU, Pred); + if (isNewLoad) + AddPred(LoadSU, Pred); + } + for (const SDep &Pred : LoadPreds) { + RemovePred(SU, Pred); + if (isNewLoad) + AddPred(LoadSU, Pred); + } + for (const SDep &Pred : NodePreds) { + RemovePred(SU, Pred); + AddPred(NewSU, Pred); + } + for (SDep D : NodeSuccs) { + SUnit *SuccDep = D.getSUnit(); + D.setSUnit(SU); + RemovePred(SuccDep, D); + D.setSUnit(NewSU); + AddPred(SuccDep, D); + // Balance register pressure. + if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled && + !D.isCtrl() && NewSU->NumRegDefsLeft > 0) + --NewSU->NumRegDefsLeft; + } + for (SDep D : ChainSuccs) { + SUnit *SuccDep = D.getSUnit(); + D.setSUnit(SU); + RemovePred(SuccDep, D); + if (isNewLoad) { + D.setSUnit(LoadSU); + AddPred(SuccDep, D); + } + } + + // Add a data dependency to reflect that NewSU reads the value defined + // by LoadSU. + SDep D(LoadSU, SDep::Data, 0); + D.setLatency(LoadSU->Latency); + AddPred(NewSU, D); + + if (isNewLoad) + AvailableQueue->addNode(LoadSU); + AvailableQueue->addNode(NewSU); + + ++NumUnfolds; + + if (NewSU->NumSuccsLeft == 0) + NewSU->isAvailable = true; + + return NewSU; +} + /// CopyAndMoveSuccessors - Clone the specified node and move its scheduled /// successors to the newly created node. SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { @@ -966,135 +1100,16 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { return nullptr; } + // If possible unfold instruction. 
if (TryUnfold) { - SmallVector<SDNode*, 2> NewNodes; - if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) + SUnit *UnfoldSU = TryUnfoldSU(SU); + if (!UnfoldSU) return nullptr; - - // unfolding an x86 DEC64m operation results in store, dec, load which - // can't be handled here so quit - if (NewNodes.size() == 3) - return nullptr; - - DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); - assert(NewNodes.size() == 2 && "Expected a load folding node!"); - - N = NewNodes[1]; - SDNode *LoadNode = NewNodes[0]; - unsigned NumVals = N->getNumValues(); - unsigned OldNumVals = SU->getNode()->getNumValues(); - for (unsigned i = 0; i != NumVals; ++i) - DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i)); - DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1), - SDValue(LoadNode, 1)); - - // LoadNode may already exist. This can happen when there is another - // load from the same location and producing the same type of value - // but it has different alignment or volatileness. - bool isNewLoad = true; - SUnit *LoadSU; - if (LoadNode->getNodeId() != -1) { - LoadSU = &SUnits[LoadNode->getNodeId()]; - isNewLoad = false; - } else { - LoadSU = CreateNewSUnit(LoadNode); - LoadNode->setNodeId(LoadSU->NodeNum); - - InitNumRegDefsLeft(LoadSU); - computeLatency(LoadSU); - } - - SUnit *NewSU = CreateNewSUnit(N); - assert(N->getNodeId() == -1 && "Node already inserted!"); - N->setNodeId(NewSU->NodeNum); - - const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); - for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { - if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { - NewSU->isTwoAddress = true; - break; - } - } - if (MCID.isCommutable()) - NewSU->isCommutable = true; - - InitNumRegDefsLeft(NewSU); - computeLatency(NewSU); - - // Record all the edges to and from the old SU, by category. - SmallVector<SDep, 4> ChainPreds; - SmallVector<SDep, 4> ChainSuccs; - SmallVector<SDep, 4> LoadPreds; - SmallVector<SDep, 4> NodePreds; - SmallVector<SDep, 4> NodeSuccs; - for (SDep &Pred : SU->Preds) { - if (Pred.isCtrl()) - ChainPreds.push_back(Pred); - else if (isOperandOf(Pred.getSUnit(), LoadNode)) - LoadPreds.push_back(Pred); - else - NodePreds.push_back(Pred); - } - for (SDep &Succ : SU->Succs) { - if (Succ.isCtrl()) - ChainSuccs.push_back(Succ); - else - NodeSuccs.push_back(Succ); - } - - // Now assign edges to the newly-created nodes. - for (const SDep &Pred : ChainPreds) { - RemovePred(SU, Pred); - if (isNewLoad) - AddPred(LoadSU, Pred); - } - for (const SDep &Pred : LoadPreds) { - RemovePred(SU, Pred); - if (isNewLoad) - AddPred(LoadSU, Pred); - } - for (const SDep &Pred : NodePreds) { - RemovePred(SU, Pred); - AddPred(NewSU, Pred); - } - for (SDep D : NodeSuccs) { - SUnit *SuccDep = D.getSUnit(); - D.setSUnit(SU); - RemovePred(SuccDep, D); - D.setSUnit(NewSU); - AddPred(SuccDep, D); - // Balance register pressure. - if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled - && !D.isCtrl() && NewSU->NumRegDefsLeft > 0) - --NewSU->NumRegDefsLeft; - } - for (SDep D : ChainSuccs) { - SUnit *SuccDep = D.getSUnit(); - D.setSUnit(SU); - RemovePred(SuccDep, D); - if (isNewLoad) { - D.setSUnit(LoadSU); - AddPred(SuccDep, D); - } - } - - // Add a data dependency to reflect that NewSU reads the value defined - // by LoadSU. 
- SDep D(LoadSU, SDep::Data, 0); - D.setLatency(LoadSU->Latency); - AddPred(NewSU, D); - - if (isNewLoad) - AvailableQueue->addNode(LoadSU); - AvailableQueue->addNode(NewSU); - - ++NumUnfolds; - - if (NewSU->NumSuccsLeft == 0) { - NewSU->isAvailable = true; - return NewSU; - } - SU = NewSU; + SU = UnfoldSU; + N = SU->getNode(); + // If this can be scheduled don't bother duplicating and just return + if (SU->NumSuccsLeft == 0) + return SU; } DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n"); @@ -1265,10 +1280,9 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { // // If SU is the currently live definition of the same register that it uses, // then we are free to schedule it. - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU) - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(), + for (SDep &Pred : SU->Preds) { + if (Pred.isAssignedRegDep() && LiveRegDefs[Pred.getReg()] != SU) + CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -1305,7 +1319,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { // If we're in the middle of scheduling a call, don't begin scheduling // another call. Also, don't allow any physical registers to be live across // the call. - if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) || + (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) { // Check the special calling-sequence resource. unsigned CallResource = TRI->getNumRegs(); if (LiveRegDefs[CallResource]) { @@ -1323,6 +1338,18 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); + if (MCID.hasOptionalDef()) { + // Most ARM instructions have an OptionalDef for CPSR, to model the S-bit. + // This operand can be either a def of CPSR, if the S bit is set; or a use + // of %noreg. When the OptionalDef is set to a valid register, we need to + // handle it in the same way as an ImplicitDef. 
+ for (unsigned i = 0; i < MCID.getNumDefs(); ++i) + if (MCID.OpInfo[i].isOptionalDef()) { + const SDValue &OptionalDef = Node->getOperand(i - Node->getNumValues()); + unsigned Reg = cast<RegisterSDNode>(OptionalDef)->getReg(); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); + } + } if (!MCID.ImplicitDefs) continue; for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) @@ -1659,9 +1686,8 @@ public: RegPressure.resize(NumRC); std::fill(RegLimit.begin(), RegLimit.end(), 0); std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) - RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF); + for (const TargetRegisterClass *RC : TRI->regclasses()) + RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF); } } @@ -1735,8 +1761,7 @@ protected: template<class SF> static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) { std::vector<SUnit *>::iterator Best = Q.begin(); - for (std::vector<SUnit *>::iterator I = std::next(Q.begin()), - E = Q.end(); I != E; ++I) + for (auto I = std::next(Q.begin()), E = Q.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; SUnit *V = *Best; @@ -1788,7 +1813,7 @@ public: } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(ScheduleDAG *DAG) const override { + LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override { // Emulate pop() without clobbering NodeQueueIds. std::vector<SUnit*> DumpQueue = Queue; SF DumpPicker = Picker; @@ -1836,28 +1861,68 @@ static int checkSpecialNodes(const SUnit *left, const SUnit *right) { /// Smaller number is the higher priority. static unsigned CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) { - unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum]; - if (SethiUllmanNumber != 0) - return SethiUllmanNumber; + if (SUNumbers[SU->NodeNum] != 0) + return SUNumbers[SU->NodeNum]; + + // Use WorkList to avoid stack overflow on excessively large IRs. + struct WorkState { + WorkState(const SUnit *SU) : SU(SU) {} + const SUnit *SU; + unsigned PredsProcessed = 0; + }; - unsigned Extra = 0; - for (const SDep &Pred : SU->Preds) { - if (Pred.isCtrl()) continue; // ignore chain preds - SUnit *PredSU = Pred.getSUnit(); - unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers); - if (PredSethiUllman > SethiUllmanNumber) { - SethiUllmanNumber = PredSethiUllman; - Extra = 0; - } else if (PredSethiUllman == SethiUllmanNumber) - ++Extra; - } + SmallVector<WorkState, 16> WorkList; + WorkList.push_back(SU); + while (!WorkList.empty()) { + auto &Temp = WorkList.back(); + auto *TempSU = Temp.SU; + bool AllPredsKnown = true; + // Try to find a non-evaluated pred and push it into the processing stack. + for (unsigned P = Temp.PredsProcessed; P < TempSU->Preds.size(); ++P) { + auto &Pred = TempSU->Preds[P]; + if (Pred.isCtrl()) continue; // ignore chain preds + SUnit *PredSU = Pred.getSUnit(); + if (SUNumbers[PredSU->NodeNum] == 0) { +#ifndef NDEBUG + // In debug mode, check that we don't have such element in the stack. + for (auto It : WorkList) + assert(It.SU != PredSU && "Trying to push an element twice?"); +#endif + // Next time start processing this one starting from the next pred. 
+ Temp.PredsProcessed = P + 1; + WorkList.push_back(PredSU); + AllPredsKnown = false; + break; + } + } - SethiUllmanNumber += Extra; + if (!AllPredsKnown) + continue; - if (SethiUllmanNumber == 0) - SethiUllmanNumber = 1; + // Once all preds are known, we can calculate the answer for this one. + unsigned SethiUllmanNumber = 0; + unsigned Extra = 0; + for (const SDep &Pred : TempSU->Preds) { + if (Pred.isCtrl()) continue; // ignore chain preds + SUnit *PredSU = Pred.getSUnit(); + unsigned PredSethiUllman = SUNumbers[PredSU->NodeNum]; + assert(PredSethiUllman > 0 && "We should have evaluated this pred!"); + if (PredSethiUllman > SethiUllmanNumber) { + SethiUllmanNumber = PredSethiUllman; + Extra = 0; + } else if (PredSethiUllman == SethiUllmanNumber) + ++Extra; + } + + SethiUllmanNumber += Extra; + if (SethiUllmanNumber == 0) + SethiUllmanNumber = 1; + SUNumbers[TempSU->NodeNum] = SethiUllmanNumber; + WorkList.pop_back(); + } - return SethiUllmanNumber; + assert(SUNumbers[SU->NodeNum] > 0 && "SethiUllman should never be zero!"); + return SUNumbers[SU->NodeNum]; } /// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all @@ -1924,19 +1989,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const { // Register Pressure Tracking //===----------------------------------------------------------------------===// -void RegReductionPQBase::dumpRegPressure() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; +LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const { + for (const TargetRegisterClass *RC : TRI->regclasses()) { unsigned Id = RC->getID(); unsigned RP = RegPressure[Id]; if (!RP) continue; DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / " << RegLimit[Id] << '\n'); } -#endif } +#endif bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const { if (!TLI) @@ -2092,7 +2155,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { RegPressure[RCId] -= Cost; } } - dumpRegPressure(); + DEBUG(dumpRegPressure()); } void RegReductionPQBase::unscheduledNode(SUnit *SU) { @@ -2172,7 +2235,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { } } - dumpRegPressure(); + DEBUG(dumpRegPressure()); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 3be622f..3c8526e 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -650,6 +650,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use, } void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { + // Cannot completely remove virtual function even in release mode. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) if (!SU->getNode()) { dbgs() << "PHYS REG COPY\n"; @@ -704,8 +705,8 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, if (!N->getHasDebugValue()) return; - // Opportunistically insert immediate dbg_value uses, i.e. those with source - // order number right after the N. + // Opportunistically insert immediate dbg_value uses, i.e. those with the same + // source order number as N. 
MachineBasicBlock *BB = Emitter.getBlock(); MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos(); ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N); @@ -713,7 +714,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, if (DVs[i]->isInvalidated()) continue; unsigned DVOrder = DVs[i]->getOrder(); - if (!Order || DVOrder == ++Order) { + if (!Order || DVOrder == Order) { MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap); if (DbgMI) { Orders.push_back(std::make_pair(DVOrder, DbgMI)); @@ -835,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { GluedNodes.push_back(N); while (!GluedNodes.empty()) { SDNode *N = GluedNodes.back(); - Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned, - VRBaseMap); + Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap); // Remember the source order of the inserted instruction. if (HasDbg) ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index eee4a4b..631cb34 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -18,12 +18,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SchedulerRegistry.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/CodeGen/ResourcePriorityQueue.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e225ba8..16f425d 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1,4 +1,4 @@ -//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===// +//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===// // // The LLVM Compiler Infrastructure // @@ -13,43 +13,66 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -#include "llvm/IR/CallingConv.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" 
#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> -#include <cmath> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <limits> +#include <set> +#include <string> #include <utility> +#include <vector> using namespace llvm; @@ -93,7 +116,8 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT, // ISD Namespace //===----------------------------------------------------------------------===// -bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) { +bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal, + bool AllowShrink) { auto *BV = dyn_cast<BuildVectorSDNode>(N); if (!BV) return false; @@ -101,9 +125,11 @@ bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) { APInt SplatUndef; unsigned SplatBitSize; bool HasUndefs; - EVT EltVT = N->getValueType(0).getVectorElementType(); - return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs) && - EltVT.getSizeInBits() >= SplatBitSize; + unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); + unsigned MinSplatBits = AllowShrink ? 0 : EltSize; + return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs, + MinSplatBits) && + EltSize >= SplatBitSize; } // FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be @@ -268,7 +294,6 @@ ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) { return ISD::CondCode(Operation); } - /// For an integer comparison, return 1 if the comparison is a signed operation /// and 2 if the result is an unsigned comparison. Return zero if the operation /// does not depend on the sign of the input (setne and seteq). @@ -289,28 +314,28 @@ static int isSignedOp(ISD::CondCode Opcode) { } ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, - bool isInteger) { - if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) + bool IsInteger) { + if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed integer setcc with an unsigned integer setcc. return ISD::SETCC_INVALID; unsigned Op = Op1 | Op2; // Combine all of the condition bits. - // If the N and U bits get set then the resultant comparison DOES suddenly - // care about orderedness, and is true when ordered. + // If the N and U bits get set, then the resultant comparison DOES suddenly + // care about orderedness, and it is true when ordered. if (Op > ISD::SETTRUE2) Op &= ~16; // Clear the U bit if the N bit is set. // Canonicalize illegal integer setcc's. 
- if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT + if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT Op = ISD::SETNE; return ISD::CondCode(Op); } ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, - bool isInteger) { - if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) + bool IsInteger) { + if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed setcc with an unsigned setcc. return ISD::SETCC_INVALID; @@ -318,7 +343,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, ISD::CondCode Result = ISD::CondCode(Op1 & Op2); // Canonicalize illegal integer setcc's. - if (isInteger) { + if (IsInteger) { switch (Result) { default: break; case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT @@ -337,7 +362,6 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, //===----------------------------------------------------------------------===// /// AddNodeIDOpcode - Add the node opcode to the NodeID data. -/// static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) { ID.AddInteger(OpC); } @@ -349,7 +373,6 @@ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) { } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. -/// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef<SDValue> Ops) { for (auto& Op : Ops) { @@ -359,7 +382,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID, } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. -/// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef<SDUse> Ops) { for (auto& Op : Ops) { @@ -391,10 +413,9 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { break; } case ISD::TargetConstantFP: - case ISD::ConstantFP: { + case ISD::ConstantFP: ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue()); break; - } case ISD::TargetGlobalAddress: case ISD::GlobalAddress: case ISD::TargetGlobalTLSAddress: @@ -572,6 +593,11 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) { // worklist. while (!DeadNodes.empty()) { SDNode *N = DeadNodes.pop_back_val(); + // Skip to next node if we've already managed to delete the node. This could + // happen if replacing a node causes a node previously added to the node to + // be deleted. + if (N->getOpcode() == ISD::DELETED_NODE) + continue; for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeDeleted(N, nullptr); @@ -639,12 +665,15 @@ void SelectionDAG::DeallocateNode(SDNode *N) { // If we have operands, deallocate them. removeOperands(N); + NodeAllocator.Deallocate(AllNodes.remove(N)); + // Set the opcode to DELETED_NODE to help catch bugs when node // memory is reallocated. + // FIXME: There are places in SDag that have grown a dependency on the opcode + // value in the released node. + __asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType)); N->NodeType = ISD::DELETED_NODE; - NodeAllocator.Deallocate(AllNodes.remove(N)); - // If any of the SDDbgValue nodes refer to this SDNode, invalidate // them and forget about that node. DbgInfo->erase(N); @@ -766,7 +795,6 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { /// maps and modified in place. Add it back to the CSE maps, unless an identical /// node already exists, in which case transfer all its users to the existing /// node. This transfer can potentially trigger recursive merging. 
-/// void SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { // For node types that aren't CSE'd, just act as if no identical node @@ -807,8 +835,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op, AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) - if (const SDNodeFlags *Flags = N->getFlags()) - Node->intersectFlagsWith(Flags); + Node->intersectFlagsWith(N->getFlags()); return Node; } @@ -828,12 +855,10 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) - if (const SDNodeFlags *Flags = N->getFlags()) - Node->intersectFlagsWith(Flags); + Node->intersectFlagsWith(N->getFlags()); return Node; } - /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a @@ -848,8 +873,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops, AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) - if (const SDNodeFlags *Flags = N->getFlags()) - Node->intersectFlagsWith(Flags); + Node->intersectFlagsWith(N->getFlags()); return Node; } @@ -863,19 +887,20 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) - : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL), + : TM(tm), OptLevel(OL), EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), - Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), - UpdateListeners(nullptr) { + Root(getEntryNode()) { InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); } -void SelectionDAG::init(MachineFunction &mf) { - MF = &mf; +void SelectionDAG::init(MachineFunction &NewMF, + OptimizationRemarkEmitter &NewORE) { + MF = &NewMF; + ORE = &NewORE; TLI = getSubtarget().getTargetLowering(); TSI = getSubtarget().getSelectionDAGInfo(); - Context = &mf.getFunction()->getContext(); + Context = &MF->getFunction()->getContext(); } SelectionDAG::~SelectionDAG() { @@ -895,29 +920,6 @@ void SelectionDAG::allnodes_clear() { #endif } -SDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, const SDLoc &DL, - SDVTList VTs, SDValue N1, SDValue N2, - const SDNodeFlags *Flags) { - SDValue Ops[] = {N1, N2}; - - if (isBinOpWithFlags(Opcode)) { - // If no flags were passed in, use a default flags object. - SDNodeFlags F; - if (Flags == nullptr) - Flags = &F; - - auto *FN = newSDNode<BinaryWithFlagsSDNode>(Opcode, DL.getIROrder(), - DL.getDebugLoc(), VTs, *Flags); - createOperands(FN, Ops); - - return FN; - } - - auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); - createOperands(N, Ops); - return N; -} - SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) { SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos); @@ -979,6 +981,12 @@ void SelectionDAG::clear() { DbgInfo->clear(); } +SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) { + return VT.bitsGT(Op.getValueType()) + ? getNode(ISD::FP_EXTEND, DL, VT, Op) + : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); +} + SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? 
getNode(ISD::ANY_EXTEND, DL, VT, Op) : @@ -1052,7 +1060,6 @@ SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL, } /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). -/// SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue NegOne = @@ -1331,7 +1338,6 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, return SDValue(N, 0); } - SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, @@ -1465,7 +1471,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, // Validate that all indices in Mask are within the range of the elements // input to the shuffle. int NElts = Mask.size(); - assert(all_of(Mask, [&](int M) { return M < (NElts * 2); }) && + assert(llvm::all_of(Mask, [&](int M) { return M < (NElts * 2); }) && "Index out of range"); // Copy the mask so we can do any needed cleanup. @@ -1824,7 +1830,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout())); + return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); } SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { @@ -1837,7 +1843,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(Bytes, Align, false); - return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout())); + return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); } SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, @@ -1953,7 +1959,7 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, /// use this predicate to simplify operations downstream. bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); - return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth); + return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use @@ -1961,9 +1967,9 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { /// for bits that V cannot have. bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { - APInt KnownZero, KnownOne; - computeKnownBits(Op, KnownZero, KnownOne, Depth); - return (KnownZero & Mask) == Mask; + KnownBits Known; + computeKnownBits(Op, Known, Depth); + return Mask.isSubsetOf(Known.Zero); } /// If a SHL/SRA/SRL node has a constant or splat constant shift amount that @@ -1979,33 +1985,30 @@ static const APInt *getValidShiftAmountConstant(SDValue V) { } /// Determine which bits of Op are known to be either zero or one and return -/// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are -/// those that are shared by every vector element. -void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, - APInt &KnownOne, unsigned Depth) const { +/// them in Known. For vectors, the known bits are those that are shared by +/// every vector element. +void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, + unsigned Depth) const { EVT VT = Op.getValueType(); APInt DemandedElts = VT.isVector() ? 
APInt::getAllOnesValue(VT.getVectorNumElements()) : APInt(1, 1); - computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth); + computeKnownBits(Op, Known, DemandedElts, Depth); } /// Determine which bits of Op are known to be either zero or one and return -/// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows -/// us to only collect the known bits that are shared by the requested vector -/// elements. -/// TODO: We only support DemandedElts on a few opcodes so far, the remainder -/// should be added when they become necessary. -void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, - APInt &KnownOne, const APInt &DemandedElts, +/// them in Known. The DemandedElts argument allows us to only collect the known +/// bits that are shared by the requested vector elements. +void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, + const APInt &DemandedElts, unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); - KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + Known = KnownBits(BitWidth); // Don't know anything. if (Depth == 6) return; // Limit search depth. - APInt KnownZero2, KnownOne2; + KnownBits Known2; unsigned NumElts = DemandedElts.getBitWidth(); if (!DemandedElts) @@ -2015,35 +2018,34 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, switch (Opcode) { case ISD::Constant: // We know all of the bits for a constant! - KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue(); - KnownZero = ~KnownOne; + Known.One = cast<ConstantSDNode>(Op)->getAPIntValue(); + Known.Zero = ~Known.One; break; case ISD::BUILD_VECTOR: // Collect the known bits that are shared by every demanded vector element. assert(NumElts == Op.getValueType().getVectorNumElements() && "Unexpected vector size"); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { if (!DemandedElts[i]) continue; SDValue SrcOp = Op.getOperand(i); - computeKnownBits(SrcOp, KnownZero2, KnownOne2, Depth + 1); + computeKnownBits(SrcOp, Known2, Depth + 1); // BUILD_VECTOR can implicitly truncate sources, we must handle this. if (SrcOp.getValueSizeInBits() != BitWidth) { assert(SrcOp.getValueSizeInBits() > BitWidth && "Expected BUILD_VECTOR implicit truncation"); - KnownOne2 = KnownOne2.trunc(BitWidth); - KnownZero2 = KnownZero2.trunc(BitWidth); + Known2 = Known2.trunc(BitWidth); } // Known bits are the values that are shared by every demanded element. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; } break; @@ -2051,7 +2053,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Collect the known bits that are shared by every vector element referenced // by the shuffle. APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); for (unsigned i = 0; i != NumElts; ++i) { @@ -2062,8 +2064,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (M < 0) { // For UNDEF elements, we don't know anything about the common state of // the shuffle result. 
- KnownOne.clearAllBits(); - KnownZero.clearAllBits(); + Known.resetAll(); DemandedLHS.clearAllBits(); DemandedRHS.clearAllBits(); break; @@ -2077,24 +2078,24 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Known bits are the values that are shared by every demanded element. if (!!DemandedLHS) { SDValue LHS = Op.getOperand(0); - computeKnownBits(LHS, KnownZero2, KnownOne2, DemandedLHS, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; if (!!DemandedRHS) { SDValue RHS = Op.getOperand(1); - computeKnownBits(RHS, KnownZero2, KnownOne2, DemandedRHS, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } break; } case ISD::CONCAT_VECTORS: { // Split DemandedElts and test each of the demanded subvectors. - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); EVT SubVectorVT = Op.getOperand(0).getValueType(); unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); unsigned NumSubVectors = Op.getNumOperands(); @@ -2103,12 +2104,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, DemandedSub = DemandedSub.trunc(NumSubVectorElts); if (!!DemandedSub) { SDValue Sub = Op.getOperand(i); - computeKnownBits(Sub, KnownZero2, KnownOne2, DemandedSub, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(Sub, Known2, DemandedSub, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; } break; @@ -2123,9 +2124,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Offset the demanded elts by the subvector index. uint64_t Idx = SubIdx->getZExtValue(); APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); - computeKnownBits(Src, KnownZero, KnownOne, DemandedSrc, Depth + 1); + computeKnownBits(Src, Known, DemandedSrc, Depth + 1); } else { - computeKnownBits(Src, KnownZero, KnownOne, Depth + 1); + computeKnownBits(Src, Known, Depth + 1); } break; } @@ -2139,7 +2140,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Fast handling of 'identity' bitcasts. 
if (BitWidth == SubBitWidth) { - computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1); + computeKnownBits(N0, Known, DemandedElts, Depth + 1); break; } @@ -2163,10 +2164,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, SubDemandedElts.setBit(i * SubScale); for (unsigned i = 0; i != SubScale; ++i) { - computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts.shl(i), + computeKnownBits(N0, Known2, SubDemandedElts.shl(i), Depth + 1); - KnownOne |= KnownOne2.zext(BitWidth).shl(SubBitWidth * i); - KnownZero |= KnownZero2.zext(BitWidth).shl(SubBitWidth * i); + Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * i); + Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i); } } @@ -2183,16 +2184,16 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (DemandedElts[i]) SubDemandedElts.setBit(i / SubScale); - computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts, Depth + 1); + computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i != NumElts; ++i) if (DemandedElts[i]) { unsigned Offset = (i % SubScale) * BitWidth; - KnownOne &= KnownOne2.lshr(Offset).trunc(BitWidth); - KnownZero &= KnownZero2.lshr(Offset).trunc(BitWidth); + Known.One &= Known2.One.lshr(Offset).trunc(BitWidth); + Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; } } @@ -2200,107 +2201,90 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, } case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-1 bits are only known if set in both the LHS & RHS. - KnownOne &= KnownOne2; + Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. - KnownZero |= KnownZero2; + Known.Zero |= Known2.Zero; break; case ISD::OR: - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-0 bits are only known if clear in both the LHS & RHS. - KnownZero &= KnownZero2; + Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. - KnownOne |= KnownOne2; + Known.One |= Known2.One; break; case ISD::XOR: { - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-0 bits are known if clear or set in both the LHS & RHS. - APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); + APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. 
- KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); - KnownZero = KnownZeroOut; + Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); + Known.Zero = KnownZeroOut; break; } case ISD::MUL: { - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If low bits are zero in either operand, output low known-0 bits. // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. - KnownOne.clearAllBits(); - unsigned TrailZ = KnownZero.countTrailingOnes() + - KnownZero2.countTrailingOnes(); - unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + - KnownZero2.countLeadingOnes(), + unsigned TrailZ = Known.countMinTrailingZeros() + + Known2.countMinTrailingZeros(); + unsigned LeadZ = std::max(Known.countMinLeadingZeros() + + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; - TrailZ = std::min(TrailZ, BitWidth); - LeadZ = std::min(LeadZ, BitWidth); - KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | - APInt::getHighBitsSet(BitWidth, LeadZ); + Known.resetAll(); + Known.Zero.setLowBits(std::min(TrailZ, BitWidth)); + Known.Zero.setHighBits(std::min(LeadZ, BitWidth)); break; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - unsigned LeadZ = KnownZero2.countLeadingOnes(); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + unsigned LeadZ = Known2.countMinLeadingZeros(); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); - if (RHSUnknownLeadingOnes != BitWidth) - LeadZ = std::min(BitWidth, - LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); + if (RHSMaxLeadingZeros != BitWidth) + LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); - KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); + Known.Zero.setHighBits(LeadZ); break; } case ISD::SELECT: - computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(2), Known, Depth+1); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(1), Known2, Depth+1); // Only known if known in both the LHS and RHS. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; case ISD::SELECT_CC: - computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(3), Known, Depth+1); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; - computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(2), Known2, Depth+1); // Only known if known in both the LHS and RHS. 
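// Editor's note: the AND/OR/XOR propagation rules above can be exercised in
// isolation. The following is a minimal stand-in sketch using plain 8-bit
// masks (Zero = bits known 0, One = bits known 1); it is illustrative only
// and is not the LLVM KnownBits API.
#include <cstdint>
namespace {
struct MiniKnown { uint8_t Zero, One; };
MiniKnown knownAnd(MiniKnown A, MiniKnown B) {
  // Result bit is 0 if either input bit is 0, and 1 only if both are 1.
  return {uint8_t(A.Zero | B.Zero), uint8_t(A.One & B.One)};
}
MiniKnown knownOr(MiniKnown A, MiniKnown B) {
  // Result bit is 1 if either input bit is 1, and 0 only if both are 0.
  return {uint8_t(A.Zero & B.Zero), uint8_t(A.One | B.One)};
}
MiniKnown knownXor(MiniKnown A, MiniKnown B) {
  // Equal known bits give a known 0; differing known bits give a known 1.
  return {uint8_t((A.Zero & B.Zero) | (A.One & B.One)),
          uint8_t((A.Zero & B.One) | (A.One & B.Zero))};
}
} // namespace
// For example, with A = {0x0C, 0x01} and B = {0xF0, 0x02},
// knownAnd(A, B) yields {0xFC, 0x00}: bits 2 through 7 of the result are
// known zero and no bit is known to be one.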
- KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) @@ -2312,51 +2296,46 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; case ISD::SHL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero << *ShAmt; - KnownOne = KnownOne << *ShAmt; + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.Zero <<= *ShAmt; + Known.One <<= *ShAmt; // Low bits are known zero. - KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue()); + Known.Zero.setLowBits(ShAmt->getZExtValue()); } break; case ISD::SRL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero.lshr(*ShAmt); - KnownOne = KnownOne.lshr(*ShAmt); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.Zero.lshrInPlace(*ShAmt); + Known.One.lshrInPlace(*ShAmt); // High bits are known zero. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); - KnownZero |= HighBits; + Known.Zero.setHighBits(ShAmt->getZExtValue()); } break; case ISD::SRA: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero.lshr(*ShAmt); - KnownOne = KnownOne.lshr(*ShAmt); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.Zero.lshrInPlace(*ShAmt); + Known.One.lshrInPlace(*ShAmt); // If we know the value of the sign bit, then we know it is copied across // the high bits by the shift amount. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); - APInt SignBit = APInt::getSignBit(BitWidth); - SignBit = SignBit.lshr(*ShAmt); // Adjust to where it is now in the mask. - if (KnownZero.intersects(SignBit)) { - KnownZero |= HighBits; // New bits are known zero. - } else if (KnownOne.intersects(SignBit)) { - KnownOne |= HighBits; // New bits are known one. + APInt SignMask = APInt::getSignMask(BitWidth); + SignMask.lshrInPlace(*ShAmt); // Adjust to where it is now in the mask. + if (Known.Zero.intersects(SignMask)) { + Known.Zero.setHighBits(ShAmt->getZExtValue());// New bits are known zero. + } else if (Known.One.intersects(SignMask)) { + Known.One.setHighBits(ShAmt->getZExtValue()); // New bits are known one. } } break; @@ -2368,42 +2347,56 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // present in the input. 
APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits); - APInt InSignBit = APInt::getSignBit(EBits); + APInt InSignMask = APInt::getSignMask(EBits); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits); // If the sign extended bits are demanded, we know that the sign // bit is demanded. - InSignBit = InSignBit.zext(BitWidth); + InSignMask = InSignMask.zext(BitWidth); if (NewBits.getBoolValue()) - InputDemandedBits |= InSignBit; + InputDemandedBits |= InSignMask; - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownOne &= InputDemandedBits; - KnownZero &= InputDemandedBits; + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.One &= InputDemandedBits; + Known.Zero &= InputDemandedBits; // If the sign bit of the input is known set or clear, then we know the // top bits of the result. - if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear - KnownZero |= NewBits; - KnownOne &= ~NewBits; - } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set - KnownOne |= NewBits; - KnownZero &= ~NewBits; + if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear + Known.Zero |= NewBits; + Known.One &= ~NewBits; + } else if (Known.One.intersects(InSignMask)) { // Input sign bit known set + Known.One |= NewBits; + Known.Zero &= ~NewBits; } else { // Input sign bit unknown - KnownZero &= ~NewBits; - KnownOne &= ~NewBits; + Known.Zero &= ~NewBits; + Known.One &= ~NewBits; } break; } case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + // If we have a known 1, its position is our upper bound. + unsigned PossibleTZ = Known2.countMaxTrailingZeros(); + unsigned LowBits = Log2_32(PossibleTZ) + 1; + Known.Zero.setBitsFrom(LowBits); + break; + } case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_UNDEF: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + // If we have a known 1, its position is our upper bound. + unsigned PossibleLZ = Known2.countMaxLeadingZeros(); + unsigned LowBits = Log2_32(PossibleLZ) + 1; + Known.Zero.setBitsFrom(LowBits); + break; + } case ISD::CTPOP: { - unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); - KnownOne.clearAllBits(); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + // If we know some of the bits are zero, they can't be one. 
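// Editor's note: a worked example of the bound computed in the new CTTZ/CTLZ
// cases above. If the operand is known to have at most 5 trailing zeros
// (countMaxTrailingZeros() == 5), then cttz can only return a value in the
// range [0, 5]; that fits in Log2_32(5) + 1 = 3 bits, so setBitsFrom(3) marks
// bits [3, BitWidth) of the result as known zero. The CTLZ case is the same
// argument applied to leading zeros.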
+ unsigned PossibleOnes = Known2.countMaxPopulation(); + Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1); break; } case ISD::LOAD: { @@ -2412,76 +2405,87 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) { EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + Known.Zero.setBitsFrom(MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { if (LD->getExtensionType() == ISD::NON_EXTLOAD) - computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne); + computeKnownBitsFromRangeMetadata(*Ranges, Known); } break; } - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND_VECTOR_INREG: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); - KnownZero = KnownZero.trunc(InBits); - KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, + DemandedElts.zext(InVT.getVectorNumElements()), Depth + 1); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); - KnownZero |= NewBits; + Known = Known.zext(BitWidth); + Known.Zero.setBitsFrom(InBits); break; } + case ISD::ZERO_EXTEND: { + EVT InVT = Op.getOperand(0).getValueType(); + unsigned InBits = InVT.getScalarSizeInBits(); + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known = Known.zext(BitWidth); + Known.Zero.setBitsFrom(InBits); + break; + } + // TODO ISD::SIGN_EXTEND_VECTOR_INREG case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - KnownZero = KnownZero.trunc(InBits); - KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); // If the sign bit is known to be zero or one, then sext will extend // it to the top bits, else it will just zext. 
- KnownZero = KnownZero.sext(BitWidth); - KnownOne = KnownOne.sext(BitWidth); + Known = Known.sext(BitWidth); break; } case ISD::ANY_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - KnownZero = KnownZero.trunc(InBits); - KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, Depth+1); + Known = Known.zext(BitWidth); break; } case ISD::TRUNCATE: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - KnownZero = KnownZero.zext(InBits); - KnownOne = KnownOne.zext(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); + Known = Known.zext(InBits); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known = Known.trunc(BitWidth); break; } case ISD::AssertZext: { EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - KnownZero |= (~InMask); - KnownOne &= (~KnownZero); + computeKnownBits(Op.getOperand(0), Known, Depth+1); + Known.Zero |= (~InMask); + Known.One &= (~Known.Zero); break; } case ISD::FGETSIGN: // All bits are zero except the low bit. - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; - - case ISD::SUB: { + case ISD::USUBO: + case ISD::SSUBO: + if (Op.getResNo() == 1) { + // If we know the result of a setcc has the top bits zero, use this info. + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + break; + } + LLVM_FALLTHROUGH; + case ISD::SUB: + case ISD::SUBC: { if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is @@ -2490,22 +2494,47 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is // from [0-C]. - if ((KnownZero2 & MaskV) == MaskV) { + if ((Known2.Zero & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. - KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); + Known.Zero.setHighBits(NLZ2); } } } - LLVM_FALLTHROUGH; + + // If low bits are know to be zero in both operands, then we know they are + // going to be 0 in the result. Both addition and complement operations + // preserve the low zero bits. 
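// Editor's note: a small example of the low-zero-bits rule described in the
// comment just above. If both operands are known multiples of 8 (their low
// three bits are known zero), then their sum or difference is also a multiple
// of 8: no carry or borrow can be generated into bits 0-2, so those bits of
// the result are known zero as well.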
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); + if (KnownZeroLow == 0) + break; + + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); + Known.Zero.setLowBits(KnownZeroLow); + break; } + case ISD::UADDO: + case ISD::SADDO: + case ISD::ADDCARRY: + if (Op.getResNo() == 1) { + // If we know the result of a setcc has the top bits zero, use this info. + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + break; + } + LLVM_FALLTHROUGH; case ISD::ADD: + case ISD::ADDC: case ISD::ADDE: { // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the @@ -2514,31 +2543,28 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // known to be clear. For example, if one input has the top 10 bits clear // and the other has the top 8 bits clear, we know the top 7 bits of the // output must be clear. - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - unsigned KnownZeroHigh = KnownZero2.countLeadingOnes(); - unsigned KnownZeroLow = KnownZero2.countTrailingOnes(); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + unsigned KnownZeroHigh = Known2.countMinLeadingZeros(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - KnownZeroHigh = std::min(KnownZeroHigh, - KnownZero2.countLeadingOnes()); - KnownZeroLow = std::min(KnownZeroLow, - KnownZero2.countTrailingOnes()); - - if (Opcode == ISD::ADD) { - KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow); - if (KnownZeroHigh > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1); + KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); + + if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) { + // With ADDE and ADDCARRY, a carry bit may be added in, so we can only + // use this information if we know (at least) that the low two bits are + // clear. We then return to the caller that the low bit is unknown but + // that other bits are known zero. + if (KnownZeroLow >= 2) + Known.Zero.setBits(1, KnownZeroLow); break; } - // With ADDE, a carry bit may be added in, so we can only use this - // information if we know (at least) that the low two bits are clear. We - // then return to the caller that the low bit is unknown but that other bits - // are known zero. - if (KnownZeroLow >= 2) // ADDE - KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow); + Known.Zero.setLowBits(KnownZeroLow); + if (KnownZeroHigh > 1) + Known.Zero.setHighBits(KnownZeroHigh - 1); break; } case ISD::SREM: @@ -2546,23 +2572,22 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const APInt &RA = Rem->getAPIntValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // The low bits of the first operand are unchanged by the srem. 
- KnownZero = KnownZero2 & LowBits; - KnownOne = KnownOne2 & LowBits; + Known.Zero = Known2.Zero & LowBits; + Known.One = Known2.One & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. - if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits)) - KnownZero |= ~LowBits; + if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits)) + Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. - if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) - KnownOne |= ~LowBits; - assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); + if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0)) + Known.One |= ~LowBits; + assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?"); } } break; @@ -2571,41 +2596,37 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // The upper bits are all zero, the lower ones are unchanged. - KnownZero = KnownZero2 | ~LowBits; - KnownOne = KnownOne2 & LowBits; + Known.Zero = Known2.Zero | ~LowBits; + Known.One = Known2.One & LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); - KnownOne.clearAllBits(); - KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); + uint32_t Leaders = + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); + Known.resetAll(); + Known.Zero.setHighBits(Leaders); break; } case ISD::EXTRACT_ELEMENT: { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), Known, Depth+1); const unsigned Index = Op.getConstantOperandVal(1); const unsigned BitWidth = Op.getValueSizeInBits(); // Remove low part of known bits mask - KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth); - KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth); + Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth); + Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth); // Remove high part of known bit mask - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); + Known = Known.trunc(BitWidth); break; } case ISD::EXTRACT_VECTOR_ELT: { @@ -2617,24 +2638,20 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const unsigned NumSrcElts = VecVT.getVectorNumElements(); // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know // anything about the extended bits. 
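// Editor's note: for the UREM-by-power-of-two rule above, x urem 16 behaves
// like x & 15, so bits 4 and up of the result are known zero while bits 0-3
// simply inherit whatever is known about x. That is exactly
// Known.Zero = Known2.Zero | ~LowBits and Known.One = Known2.One & LowBits
// with LowBits = 15.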
- if (BitWidth > EltBitWidth) { - KnownZero = KnownZero.trunc(EltBitWidth); - KnownOne = KnownOne.trunc(EltBitWidth); - } + if (BitWidth > EltBitWidth) + Known = Known.trunc(EltBitWidth); ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) { // If we know the element index, just demand that vector element. unsigned Idx = ConstEltNo->getZExtValue(); APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); - computeKnownBits(InVec, KnownZero, KnownOne, DemandedElt, Depth + 1); + computeKnownBits(InVec, Known, DemandedElt, Depth + 1); } else { // Unknown element index, so ignore DemandedElts and demand them all. - computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1); - } - if (BitWidth > EltBitWidth) { - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + computeKnownBits(InVec, Known, Depth + 1); } + if (BitWidth > EltBitWidth) + Known = Known.zext(BitWidth); break; } case ISD::INSERT_VECTOR_ELT: { @@ -2646,60 +2663,110 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { // If we know the element index, split the demand between the // source vector and the inserted element. - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth); unsigned EltIdx = CEltNo->getZExtValue(); // If we demand the inserted element then add its common known bits. if (DemandedElts[EltIdx]) { - computeKnownBits(InVal, KnownZero2, KnownOne2, Depth + 1); - KnownOne &= KnownOne2.zextOrTrunc(KnownOne.getBitWidth()); - KnownZero &= KnownZero2.zextOrTrunc(KnownZero.getBitWidth());; + computeKnownBits(InVal, Known2, Depth + 1); + Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth()); + Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth()); } // If we demand the source vector then add its common known bits, ensuring // that we don't demand the inserted element. APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx)); if (!!VectorElts) { - computeKnownBits(InVec, KnownZero2, KnownOne2, VectorElts, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(InVec, Known2, VectorElts, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } } else { // Unknown element index, so ignore DemandedElts and demand them all. - computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1); - computeKnownBits(InVal, KnownZero2, KnownOne2, Depth + 1); - KnownOne &= KnownOne2.zextOrTrunc(KnownOne.getBitWidth()); - KnownZero &= KnownZero2.zextOrTrunc(KnownZero.getBitWidth());; + computeKnownBits(InVec, Known, Depth + 1); + computeKnownBits(InVal, Known2, Depth + 1); + Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth()); + Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth()); } break; } + case ISD::BITREVERSE: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + Known.Zero = Known2.Zero.reverseBits(); + Known.One = Known2.One.reverseBits(); + break; + } case ISD::BSWAP: { - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + Known.Zero = Known2.Zero.byteSwap(); + Known.One = Known2.One.byteSwap(); + break; + } + case ISD::ABS: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + + // If the source's MSB is zero then we know the rest of the bits already. 
+ if (Known2.isNonNegative()) { + Known.Zero = Known2.Zero; + Known.One = Known2.One; + break; + } + + // We only know that the absolute values's MSB will be zero iff there is + // a set bit that isn't the sign bit (otherwise it could be INT_MIN). + Known2.One.clearSignBit(); + if (Known2.One.getBoolValue()) { + Known.Zero = APInt::getSignMask(BitWidth); + break; + } + break; + } + case ISD::UMIN: { + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + + // UMIN - we know that the result will have the maximum of the + // known zero leading bits of the inputs. + unsigned LeadZero = Known.countMinLeadingZeros(); + LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros()); + + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + Known.Zero.setHighBits(LeadZero); + break; + } + case ISD::UMAX: { + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); - KnownZero = KnownZero2.byteSwap(); - KnownOne = KnownOne2.byteSwap(); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + + // UMAX - we know that the result will have the maximum of the + // known one leading bits of the inputs. + unsigned LeadOne = Known.countMinLeadingOnes(); + LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes()); + + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + Known.One.setHighBits(LeadOne); break; } case ISD::SMIN: - case ISD::SMAX: - case ISD::UMIN: - case ISD::UMAX: { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + case ISD::SMAX: { + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; break; } case ISD::FrameIndex: case ISD::TargetFrameIndex: if (unsigned Align = InferPtrAlignment(Op)) { // The low bits are known zero if the pointer is aligned. - KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); + Known.Zero.setLowBits(Log2_32(Align)); break; } break; @@ -2712,11 +2779,45 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. 
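// Editor's note: intuition for the UMIN/UMAX rules above. If one input of a
// UMIN is known to be below 16 (its top four bits are known zero), then the
// minimum is no larger than that input, so the result's top four bits are
// known zero regardless of the other input. UMAX is the dual argument: known
// leading one bits of either input carry over to the result.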
- TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); + TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth); break; } - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); +} + +SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, + SDValue N1) const { + // X + 0 never overflow + if (isNullConstant(N1)) + return OFK_Never; + + KnownBits N1Known; + computeKnownBits(N1, N1Known); + if (N1Known.Zero.getBoolValue()) { + KnownBits N0Known; + computeKnownBits(N0, N0Known); + + bool overflow; + (void)(~N0Known.Zero).uadd_ov(~N1Known.Zero, overflow); + if (!overflow) + return OFK_Never; + } + + // mulhi + 1 never overflow + if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && + (~N1Known.Zero & 0x01) == ~N1Known.Zero) + return OFK_Never; + + if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { + KnownBits N0Known; + computeKnownBits(N0, N0Known); + + if ((~N0Known.Zero & 0x01) == ~N0Known.Zero) + return OFK_Never; + } + + return OFK_Sometime; } bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { @@ -2730,7 +2831,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // A left-shift of a constant one will have exactly one bit set because // shifting the bit off the end is undefined. if (Val.getOpcode() == ISD::SHL) { - auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0)); + auto *C = isConstOrConstSplat(Val.getOperand(0)); if (C && C->getAPIntValue() == 1) return true; } @@ -2738,14 +2839,14 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // Similarly, a logical right-shift of a constant sign-bit will have exactly // one bit set. if (Val.getOpcode() == ISD::SRL) { - auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0)); - if (C && C->getAPIntValue().isSignBit()) + auto *C = isConstOrConstSplat(Val.getOperand(0)); + if (C && C->getAPIntValue().isSignMask()) return true; } // Are all operands of a build vector constant powers of two? if (Val.getOpcode() == ISD::BUILD_VECTOR) - if (llvm::all_of(Val->ops(), [this, BitWidth](SDValue E) { + if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E)) return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2(); return false; @@ -2756,22 +2857,34 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // to handle some common cases. // Fall back to computeKnownBits to catch other known cases. - APInt KnownZero, KnownOne; - computeKnownBits(Val, KnownZero, KnownOne); - return (KnownZero.countPopulation() == BitWidth - 1) && - (KnownOne.countPopulation() == 1); + KnownBits Known; + computeKnownBits(Val, Known); + return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1); } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { EVT VT = Op.getValueType(); + APInt DemandedElts = VT.isVector() + ? APInt::getAllOnesValue(VT.getVectorNumElements()) + : APInt(1, 1); + return ComputeNumSignBits(Op, DemandedElts, Depth); +} + +unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, + unsigned Depth) const { + EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarSizeInBits(); + unsigned NumElts = DemandedElts.getBitWidth(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; if (Depth == 6) return 1; // Limit search depth. 
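// Editor's note: the rewritten isKnownToBeAPowerOfTwo fallback above reads
// directly off the known-bits population counts: at most one bit can possibly
// be set and at least one bit is certainly set, hence exactly one bit is set.
// A minimal stand-alone illustration with plain 8-bit masks follows (a sketch
// only, not the LLVM KnownBits interface).
#include <bitset>
#include <cstdint>
namespace {
// Zero = bits known to be 0, One = bits known to be 1 (the two never overlap).
bool knownPowerOfTwo(uint8_t Zero, uint8_t One) {
  unsigned MaxPop = 8 - std::bitset<8>(Zero).count(); // bits that could be 1
  unsigned MinPop = std::bitset<8>(One).count();      // bits certainly 1
  return MaxPop == 1 && MinPop == 1;
}
} // namespace
// Example: Zero = 0xF7 and One = 0x08 pin the value down to exactly 8, and
// knownPowerOfTwo(0xF7, 0x08) returns true.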
+ if (!DemandedElts) + return 1; // No demanded elts, better to assume we don't know anything. + switch (Op.getOpcode()) { default: break; case ISD::AssertSext: @@ -2786,7 +2899,61 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { return Val.getNumSignBits(); } + case ISD::BUILD_VECTOR: + Tmp = VTBits; + for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) { + if (!DemandedElts[i]) + continue; + + SDValue SrcOp = Op.getOperand(i); + Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1); + + // BUILD_VECTOR can implicitly truncate sources, we must handle this. + if (SrcOp.getValueSizeInBits() != VTBits) { + assert(SrcOp.getValueSizeInBits() > VTBits && + "Expected BUILD_VECTOR implicit truncation"); + unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; + Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1); + } + Tmp = std::min(Tmp, Tmp2); + } + return Tmp; + + case ISD::VECTOR_SHUFFLE: { + // Collect the minimum number of sign bits that are shared by every vector + // element referenced by the shuffle. + APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); + const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); + assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); + for (unsigned i = 0; i != NumElts; ++i) { + int M = SVN->getMaskElt(i); + if (!DemandedElts[i]) + continue; + // For UNDEF elements, we don't know anything about the common state of + // the shuffle result. + if (M < 0) + return 1; + if ((unsigned)M < NumElts) + DemandedLHS.setBit((unsigned)M % NumElts); + else + DemandedRHS.setBit((unsigned)M % NumElts); + } + Tmp = std::numeric_limits<unsigned>::max(); + if (!!DemandedLHS) + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); + if (!!DemandedRHS) { + Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + // If we don't know anything, early out and try computeKnownBits fall-back. + if (Tmp == 1) + break; + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; @@ -2799,7 +2966,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { return std::max(Tmp, Tmp2); case ISD::SRA: - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); // SRA X, C -> adds C sign bits. if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) { APInt ShiftVal = C->getAPIntValue(); @@ -2887,6 +3054,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { } break; case ISD::ADD: + case ISD::ADDC: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); @@ -2895,17 +3063,17 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // Special case decrementing a value (ADD X, -1): if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) if (CRHS->isAllOnesValue()) { - APInt KnownZero, KnownOne; - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + KnownBits Known; + computeKnownBits(Op.getOperand(0), Known, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. 
- if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) + if ((Known.Zero | 1).isAllOnesValue()) return VTBits; // If we are subtracting one from a positive number, there is no carry // out of the result. - if (KnownZero.isNegative()) + if (Known.isNonNegative()) return Tmp; } @@ -2920,16 +3088,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // Handle NEG. if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) if (CLHS->isNullValue()) { - APInt KnownZero, KnownOne; - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + KnownBits Known; + computeKnownBits(Op.getOperand(1), Known, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) + if ((Known.Zero | 1).isAllOnesValue()) return VTBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. - if (KnownZero.isNegative()) + if (Known.isNonNegative()) return Tmp2; // Otherwise, we treat this like a SUB. @@ -2961,28 +3129,98 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = Op.getOperand(0); + SDValue InVal = Op.getOperand(1); + SDValue EltNo = Op.getOperand(2); + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + + ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo); + if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { + // If we know the element index, split the demand between the + // source vector and the inserted element. + unsigned EltIdx = CEltNo->getZExtValue(); + + // If we demand the inserted element then get its sign bits. + Tmp = std::numeric_limits<unsigned>::max(); + if (DemandedElts[EltIdx]) { + // TODO - handle implicit truncation of inserted elements. + if (InVal.getScalarValueSizeInBits() != VTBits) + break; + Tmp = ComputeNumSignBits(InVal, Depth + 1); + } + + // If we demand the source vector then get its sign bits, and determine + // the minimum. + APInt VectorElts = DemandedElts; + VectorElts.clearBit(EltIdx); + if (!!VectorElts) { + Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + } else { + // Unknown element index, so ignore DemandedElts and demand them all. + Tmp = ComputeNumSignBits(InVec, Depth + 1); + Tmp2 = ComputeNumSignBits(InVal, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } case ISD::EXTRACT_VECTOR_ELT: { - // At the moment we keep this simple and skip tracking the specific - // element. This way we get the lowest common denominator for all elements - // of the vector. - // TODO: get information for given vector element + SDValue InVec = Op.getOperand(0); + SDValue EltNo = Op.getOperand(1); + EVT VecVT = InVec.getValueType(); const unsigned BitWidth = Op.getValueSizeInBits(); const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); + const unsigned NumSrcElts = VecVT.getVectorNumElements(); + // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know // anything about sign bits. But if the sizes match we can derive knowledge // about sign bits from the vector operand. 
- if (BitWidth == EltBitWidth) - return ComputeNumSignBits(Op.getOperand(0), Depth+1); - break; + if (BitWidth != EltBitWidth) + break; + + // If we know the element index, just demand that vector element, else for + // an unknown element index, ignore DemandedElts and demand them all. + APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts); + ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); + if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) + DemandedSrcElts = + APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + + return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1); + } + case ISD::EXTRACT_SUBVECTOR: { + // If we know the element index, just demand that subvector elements, + // otherwise demand them all. + SDValue Src = Op.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); + return ComputeNumSignBits(Src, DemandedSrc, Depth + 1); + } + return ComputeNumSignBits(Src, Depth + 1); } - case ISD::EXTRACT_SUBVECTOR: - return ComputeNumSignBits(Op.getOperand(0), Depth + 1); case ISD::CONCAT_VECTORS: - // Determine the minimum number of sign bits across all input vectors. - // Early out if the result is already 1. - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); - for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) - Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1)); + // Determine the minimum number of sign bits across all demanded + // elts of the input vectors. Early out if the result is already 1. + Tmp = std::numeric_limits<unsigned>::max(); + EVT SubVectorVT = Op.getOperand(0).getValueType(); + unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); + unsigned NumSubVectors = Op.getNumOperands(); + for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) { + APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts); + DemandedSub = DemandedSub.trunc(NumSubVectorElts); + if (!DemandedSub) + continue; + Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); return Tmp; } @@ -3008,20 +3246,22 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || Op.getOpcode() == ISD::INTRINSIC_VOID) { - unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth); - if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits); + unsigned NumBits = + TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth); + if (NumBits > 1) + FirstAnswer = std::max(FirstAnswer, NumBits); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. 
- APInt KnownZero, KnownOne; - computeKnownBits(Op, KnownZero, KnownOne, Depth); + KnownBits Known; + computeKnownBits(Op, Known, DemandedElts, Depth); APInt Mask; - if (KnownZero.isNegative()) { // sign bit is 0 - Mask = KnownZero; - } else if (KnownOne.isNegative()) { // sign bit is 1; - Mask = KnownOne; + if (Known.isNonNegative()) { // sign bit is 0 + Mask = Known.Zero; + } else if (Known.isNegative()) { // sign bit is 1; + Mask = Known.One; } else { // Nothing known. return FirstAnswer; @@ -3054,6 +3294,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const { if (getTarget().Options.NoNaNsFPMath) return true; + if (Op->getFlags().hasNoNaNs()) + return true; + // If the value is a constant, we can obviously see if it is a NaN or not. if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) return !C->getValueAPF().isNaN(); @@ -3096,16 +3339,15 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); - APInt AZero, AOne; - APInt BZero, BOne; - computeKnownBits(A, AZero, AOne); - computeKnownBits(B, BZero, BOne); - return (AZero | BZero).isAllOnesValue(); + KnownBits AKnown, BKnown; + computeKnownBits(A, AKnown); + computeKnownBits(B, BKnown); + return (AKnown.Zero | BKnown.Zero).isAllOnesValue(); } static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops, - llvm::SelectionDAG &DAG) { + SelectionDAG &DAG) { assert(!Ops.empty() && "Can't concatenate an empty list of vectors!"); assert(llvm::all_of(Ops, [Ops](SDValue Op) { @@ -3169,7 +3411,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue Operand) { + SDValue Operand, const SDNodeFlags Flags) { // Constant fold unary operations with an integer constant operand. Even // opaque constant will be folded, because the folding of unary operations // doesn't create new constants with different values. Nevertheless, the @@ -3206,6 +3448,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT); break; + case ISD::ABS: + return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(), + C->isOpaque()); + case ISD::BITREVERSE: + return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(), + C->isOpaque()); case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), C->isOpaque()); @@ -3220,6 +3468,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); + case ISD::FP16_TO_FP: { + bool Ignored; + APFloat FPV(APFloat::IEEEhalf(), + (Val.getBitWidth() == 16) ? Val : Val.trunc(16)); + + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. 
+ (void)FPV.convert(EVTToAPFloatSemantics(VT), + APFloat::rmNearestTiesToEven, &Ignored); + return getConstantFP(FPV, DL, VT); + } } } @@ -3261,17 +3520,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { - integerPart x[2]; bool ignored; - static_assert(integerPartWidth >= 64, "APFloat parts too small!"); + APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT); // FIXME need to be more flexible about rounding mode. - APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), - Opcode==ISD::FP_TO_SINT, - APFloat::rmTowardZero, &ignored); - if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual + APFloat::opStatus s = + V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored); + if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual break; - APInt api(VT.getSizeInBits(), x); - return getConstant(api, DL, VT); + return getConstant(IntVal, DL, VT); } case ISD::BITCAST: if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) @@ -3281,6 +3537,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); break; + case ISD::FP_TO_FP16: { + bool Ignored; + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. + (void)V.convert(APFloat::IEEEhalf(), + APFloat::rmNearestTiesToEven, &Ignored); + return getConstant(V.bitcastToAPInt(), DL, VT); + } } } @@ -3303,6 +3567,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: + case ISD::ABS: + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: @@ -3348,7 +3614,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); @@ -3364,8 +3630,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) - return getNode(ISD::ZERO_EXTEND, DL, VT, - Operand.getNode()->getOperand(0)); + return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. 
return getConstant(0, DL, VT); @@ -3384,13 +3649,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunx x)) -> x if (OpOpcode == ISD::TRUNCATE) { - SDValue OpOp = Operand.getNode()->getOperand(0); + SDValue OpOp = Operand.getOperand(0); if (OpOp.getValueType() == VT) return OpOp; } @@ -3406,20 +3671,26 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. - if (Operand.getNode()->getOperand(0).getValueType().getScalarType() + if (Operand.getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); - if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT)) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); - return Operand.getNode()->getOperand(0); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); + if (Operand.getOperand(0).getValueType().bitsGT(VT)) + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); + return Operand.getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; + case ISD::ABS: + assert(VT.isInteger() && VT == Operand.getValueType() && + "Invalid ABS!"); + if (OpOpcode == ISD::UNDEF) + return getUNDEF(VT); + break; case ISD::BSWAP: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid BSWAP!"); @@ -3464,15 +3735,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? 
- return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), - Operand.getNode()->getOperand(0), - &cast<BinaryWithFlagsSDNode>(Operand.getNode())->Flags); + return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), + Operand.getOperand(0), Operand.getNode()->getFlags()); if (OpOpcode == ISD::FNEG) // --X -> X - return Operand.getNode()->getOperand(0); + return Operand.getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) - return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::FABS, DL, VT, Operand.getOperand(0)); break; } @@ -3483,10 +3753,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { + E->intersectFlagsWith(Flags); return SDValue(E, 0); + } N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { @@ -3569,6 +3842,31 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, GA->getOffset() + uint64_t(Offset)); } +bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { + switch (Opcode) { + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: { + // If a divisor is zero/undef or any element of a divisor vector is + // zero/undef, the whole op is undef. + assert(Ops.size() == 2 && "Div/rem should have 2 operands"); + SDValue Divisor = Ops[1]; + if (Divisor.isUndef() || isNullConstant(Divisor)) + return true; + + return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) && + llvm::any_of(Divisor->op_values(), + [](SDValue V) { return V.isUndef() || + isNullConstant(V); }); + // TODO: Handle signed overflow. + } + // TODO: Handle oversized shifts. + default: + return false; + } +} + SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, SDNode *Cst1, SDNode *Cst2) { @@ -3578,6 +3876,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); + if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)})) + return getUNDEF(VT); + // Handle the case of two scalars. if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) { if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) { @@ -3591,7 +3892,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // fold (add Sym, c) -> Sym+c if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst1)) return FoldSymbolOffset(Opcode, VT, GA, Cst2); - if (isCommutativeBinOp(Opcode)) + if (TLI->isCommutativeBinOp(Opcode)) if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2)) return FoldSymbolOffset(Opcode, VT, GA, Cst1); @@ -3638,13 +3939,16 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops, - const SDNodeFlags *Flags) { + const SDNodeFlags Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); + if (isUndef(Opcode, Ops)) + return getUNDEF(VT); + // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? 
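// Editor's note: the practical effect of the new isUndef() check above is
// that constant folding of, e.g., (sdiv X, 0), (urem X, undef), or a vector
// division whose constant divisor has a zero or undef lane now yields UNDEF
// up front instead of attempting the arithmetic.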
if (!VT.isVector()) return SDValue(); @@ -3665,8 +3969,8 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, // All operands must be vector types with the same number of elements as // the result type and must be either UNDEF or a build vector of constant // or UNDEF scalars. - if (!all_of(Ops, IsConstantBuildVectorOrUndef) || - !all_of(Ops, IsScalarOrSameVectorSize)) + if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) || + !llvm::all_of(Ops, IsScalarOrSameVectorSize)) return SDValue(); // If we are comparing vectors, then the result needs to be a i1 boolean @@ -3676,7 +3980,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. EVT LegalSVT = VT.getScalarType(); - if (LegalSVT.isInteger()) { + if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) { LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(VT.getScalarType())) return SDValue(); @@ -3727,15 +4031,14 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue N1, SDValue N2, - const SDNodeFlags *Flags) { + SDValue N1, SDValue N2, const SDNodeFlags Flags) { ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); // Canonicalize constant to RHS if commutative. - if (isCommutativeBinOp(Opcode)) { + if (TLI->isCommutativeBinOp(Opcode)) { if (N1C && !N2C) { std::swap(N1C, N2C); std::swap(N1, N2); @@ -3910,35 +4213,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending - auto SignExtendInReg = [&](APInt Val) { + auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) { unsigned FromBits = EVT.getScalarSizeInBits(); Val <<= Val.getBitWidth() - FromBits; - Val = Val.ashr(Val.getBitWidth() - FromBits); - return getConstant(Val, DL, VT.getScalarType()); + Val.ashrInPlace(Val.getBitWidth() - FromBits); + return getConstant(Val, DL, ConstantVT); }; if (N1C) { const APInt &Val = N1C->getAPIntValue(); - return SignExtendInReg(Val); + return SignExtendInReg(Val, VT); } if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { SmallVector<SDValue, 8> Ops; + llvm::EVT OpVT = N1.getOperand(0).getValueType(); for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); if (Op.isUndef()) { - Ops.push_back(getUNDEF(VT.getScalarType())); + Ops.push_back(getUNDEF(OpVT)); continue; } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - APInt Val = C->getAPIntValue(); - Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); - Ops.push_back(SignExtendInReg(Val)); - continue; - } - break; + ConstantSDNode *C = cast<ConstantSDNode>(Op); + APInt Val = C->getAPIntValue(); + Ops.push_back(SignExtendInReg(Val, OpVT)); } - if (Ops.size() == VT.getVectorNumElements()) - return getBuildVector(VT, DL, Ops); + return getBuildVector(VT, DL, Ops); } break; } @@ -4040,6 +4339,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT.getSimpleVT() == N1.getSimpleValueType()) return N1; + // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF. 
+ if (N1.isUndef()) + return getUNDEF(VT); + + // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of + // the concat have the same type as the extract. + if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && + N1.getNumOperands() > 0 && + VT == N1.getOperand(0).getValueType()) { + unsigned Factor = VT.getVectorNumElements(); + return N1.getOperand(N2C->getZExtValue() / Factor); + } + // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created // during shuffle legalization. if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) && @@ -4110,7 +4422,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // Canonicalize an UNDEF to the RHS, even over a constant. if (N1.isUndef()) { - if (isCommutativeBinOp(Opcode)) { + if (TLI->isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { @@ -4186,21 +4498,23 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // Memoize this node if possible. SDNode *N; SDVTList VTs = getVTList(VT); + SDValue Ops[] = {N1, N2}; if (VT != MVT::Glue) { - SDValue Ops[] = {N1, N2}; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { - if (Flags) - E->intersectFlagsWith(Flags); + E->intersectFlagsWith(Flags); return SDValue(E, 0); } - N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); + N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + N->setFlags(Flags); + createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { - N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); + N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + createOperands(N, Ops); } InsertNode(N); @@ -4392,9 +4706,10 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, /// used when a memcpy is turned into a memset when the source is a constant /// string ptr. static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, - const TargetLowering &TLI, StringRef Str) { + const TargetLowering &TLI, + const ConstantDataArraySlice &Slice) { // Handle vector with all elements zero. - if (Str.empty()) { + if (Slice.Array == nullptr) { if (VT.isInteger()) return DAG.getConstant(0, dl, VT); else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) @@ -4413,15 +4728,15 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, assert(!VT.isVector() && "Can't handle vector type here!"); unsigned NumVTBits = VT.getSizeInBits(); unsigned NumVTBytes = NumVTBits / 8; - unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size())); + unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length)); APInt Val(NumVTBits, 0); if (DAG.getDataLayout().isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) - Val |= (uint64_t)(unsigned char)Str[i] << i*8; + Val |= (uint64_t)(unsigned char)Slice[i] << i*8; } else { for (unsigned i = 0; i != NumBytes; ++i) - Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8; + Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8; } // If the "cost" of materializing the integer immediate is less than the cost @@ -4438,9 +4753,8 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset, return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT)); } -/// isMemSrcFromString - Returns true if memcpy source is a string constant. -/// -static bool isMemSrcFromString(SDValue Src, StringRef &Str) { +/// Returns true if memcpy source is constant data. 
+static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) { uint64_t SrcDelta = 0; GlobalAddressSDNode *G = nullptr; if (Src.getOpcode() == ISD::GlobalAddress) @@ -4454,8 +4768,8 @@ static bool isMemSrcFromString(SDValue Src, StringRef &Str) { if (!G) return false; - return getConstantStringInfo(G->getGlobal(), Str, - SrcDelta + G->getOffset(), false); + return getConstantDataArrayInfo(G->getGlobal(), Slice, 8, + SrcDelta + G->getOffset()); } /// Determines the optimal series of memory ops to replace the memset / memcpy. @@ -4486,23 +4800,23 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, DAG.getMachineFunction()); if (VT == MVT::Other) { - if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) || - TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) { - VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS); - } else { - switch (DstAlign & 7) { - case 0: VT = MVT::i64; break; - case 4: VT = MVT::i32; break; - case 2: VT = MVT::i16; break; - default: VT = MVT::i8; break; - } - } - + // Use the largest integer type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + VT = MVT::i64; + while (DstAlign && DstAlign < VT.getSizeInBits() / 8 && + !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) + VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); + assert(VT.isInteger()); + + // Find the largest legal integer type. MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); + // If the type we've chosen is larger than the largest legal integer type + // then use that instead. if (VT.bitsGT(LVT)) VT = LVT; } @@ -4587,6 +4901,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // TODO: In the AlwaysInline case, if the size is big then generate a loop // rather than maybe a humongous number of loads and stores. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &C = *DAG.getContext(); std::vector<EVT> MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); @@ -4598,30 +4914,30 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; - StringRef Str; - bool CopyFromStr = isMemSrcFromString(Src, Str); - bool isZeroStr = CopyFromStr && Str.empty(); + ConstantDataArraySlice Slice; + bool CopyFromConstant = isMemSrcFromConstant(Src, Slice); + bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), - (isZeroStr ? 0 : SrcAlign), - false, false, CopyFromStr, true, + (isZeroConstant ? 0 : SrcAlign), + false, false, CopyFromConstant, true, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), DAG, TLI)) return SDValue(); if (DstAlignCanChange) { - Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); - unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); + Type *Ty = MemOps[0].getTypeForEVT(C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty); // Don't promote to an alignment that would require dynamic stack // realignment. 
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) while (NewAlign > Align && - DAG.getDataLayout().exceedsNaturalStackAlignment(NewAlign)) + DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign /= 2; if (NewAlign > Align) { @@ -4650,18 +4966,29 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, DstOff -= VTSize - Size; } - if (CopyFromStr && - (isZeroStr || (VT.isInteger() && !VT.isVector()))) { + if (CopyFromConstant && + (isZeroConstant || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single // instruction. It would require a load from a constantpool first. // We only handle zero vectors here. // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. - Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff)); + ConstantDataArraySlice SubSlice; + if (SrcOff < Slice.Length) { + SubSlice = Slice; + SubSlice.move(SrcOff); + } else { + // This is an out-of-bounds access and hence UB. Pretend we read zero. + SubSlice.Array = nullptr; + SubSlice.Offset = 0; + SubSlice.Length = VTSize; + } + Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice); if (Value.getNode()) Store = DAG.getStore(Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), - DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags); + DstPtrInfo.getWithOffset(DstOff), Align, + MMOFlags); } if (!Store.getNode()) { @@ -4670,12 +4997,19 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // thing to do is generate a LoadExt/StoreTrunc pair. These simplify // to Load/Store if NVT==VT. // FIXME does the case above also need this? - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + EVT NVT = TLI.getTypeToTransformTo(C, VT); assert(NVT.bitsGE(VT)); + + bool isDereferenceable = + SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); + MachineMemOperand::Flags SrcMMOFlags = MMOFlags; + if (isDereferenceable) + SrcMMOFlags |= MachineMemOperand::MODereferenceable; + Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl), SrcPtrInfo.getWithOffset(SrcOff), VT, - MinAlign(SrcAlign, SrcOff), MMOFlags); + MinAlign(SrcAlign, SrcOff), SrcMMOFlags); OutChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), @@ -4703,6 +5037,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // Expand memmove to a series of load and store ops if the size operand falls // below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &C = *DAG.getContext(); std::vector<EVT> MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); @@ -4725,8 +5061,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, return SDValue(); if (DstAlignCanChange) { - Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); - unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); + Type *Ty = MemOps[0].getTypeForEVT(C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. 
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign) @@ -4747,9 +5083,15 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value; + bool isDereferenceable = + SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); + MachineMemOperand::Flags SrcMMOFlags = MMOFlags; + if (isDereferenceable) + SrcMMOFlags |= MachineMemOperand::MODereferenceable; + Value = DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl), - SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, MMOFlags); + SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; @@ -4943,11 +5285,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5004,11 +5346,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5066,11 +5408,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5104,7 +5446,7 @@ SDValue SelectionDAG::getAtomicCmpSwap( unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, unsigned Alignment, AtomicOrdering SuccessOrdering, - AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { + AtomicOrdering FailureOrdering, SyncScope::ID SSID) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); @@ -5120,7 +5462,7 @@ SDValue SelectionDAG::getAtomicCmpSwap( MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment, - AAMDNodes(), nullptr, SynchScope, SuccessOrdering, + AAMDNodes(), 
nullptr, SSID, SuccessOrdering, FailureOrdering); return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO); @@ -5142,7 +5484,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, const Value *PtrVal, unsigned Alignment, AtomicOrdering Ordering, - SynchronizationScope SynchScope) { + SyncScope::ID SSID) { if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); @@ -5162,7 +5504,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags, MemVT.getStoreSize(), Alignment, AAMDNodes(), - nullptr, SynchScope, Ordering); + nullptr, SSID, Ordering); return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO); } @@ -5246,7 +5588,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, Opcode == ISD::PREFETCH || Opcode == ISD::LIFETIME_START || Opcode == ISD::LIFETIME_END || - (Opcode <= INT_MAX && + ((int)Opcode <= std::numeric_limits<int>::max() && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); @@ -5580,7 +5922,6 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::LoadExtType ExtTy, bool isExpanding) { - SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; FoldingSetNodeID ID; @@ -5722,11 +6063,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) { + ArrayRef<SDValue> Ops, const SDNodeFlags Flags) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, Ops[0]); + case 1: return getNode(Opcode, DL, VT, Ops[0], Flags); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; @@ -5734,13 +6075,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, switch (Opcode) { default: break; - case ISD::CONCAT_VECTORS: { + case ISD::CONCAT_VECTORS: // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; - } - case ISD::SELECT_CC: { + case ISD::SELECT_CC: assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && "LHS and RHS of condition must have same type!"); @@ -5749,14 +6089,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Ops[2].getValueType() == VT && "select_cc node must be of same type as true and false value!"); break; - } - case ISD::BR_CC: { + case ISD::BR_CC: assert(NumOps == 5 && "BR_CC takes 5 operands!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "LHS/RHS of comparison should match types!"); break; } - } // Memoize nodes. 
SDNode *N; @@ -6238,6 +6576,62 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, return N; } +SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { + unsigned OrigOpc = Node->getOpcode(); + unsigned NewOpc; + bool IsUnary = false; + switch (OrigOpc) { + default: + llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); + case ISD::STRICT_FADD: NewOpc = ISD::FADD; break; + case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break; + case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break; + case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break; + case ISD::STRICT_FREM: NewOpc = ISD::FREM; break; + case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break; + case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break; + case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break; + case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break; + case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break; + case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break; + case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break; + case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break; + case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break; + case ISD::STRICT_FNEARBYINT: + NewOpc = ISD::FNEARBYINT; + IsUnary = true; + break; + } + + // We're taking this node out of the chain, so we need to re-link things. + SDValue InputChain = Node->getOperand(0); + SDValue OutputChain = SDValue(Node, 1); + ReplaceAllUsesOfValueWith(OutputChain, InputChain); + + SDVTList VTs = getVTList(Node->getOperand(1).getValueType()); + SDNode *Res = nullptr; + if (IsUnary) + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) }); + else + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), + Node->getOperand(2) }); + + // MorphNodeTo can operate in two ways: if an existing node with the + // specified operands exists, it can just return it. Otherwise, it + // updates the node in place to have the requested operands. + if (Res == Node) { + // If we updated the node in place, reset the node ID. To the isel, + // this should be just like a newly allocated machine node. + Res->setNodeId(-1); + } else { + ReplaceAllUsesWith(Node, Res); + RemoveDeadNode(Node); + } + + return Res; +} /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. @@ -6384,14 +6778,13 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops, - const SDNodeFlags *Flags) { + const SDNodeFlags Flags) { if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) { - if (Flags) - E->intersectFlagsWith(Flags); + E->intersectFlagsWith(Flags); return E; } } @@ -6452,7 +6845,7 @@ public: : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; -} +} // end anonymous namespace /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. @@ -6498,7 +6891,6 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { AddModifiedNodeToCSEMaps(User); } - // If we just RAUW'd the root, take note. 
if (FromN == getRoot()) setRoot(To); @@ -6668,6 +7060,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ } namespace { + /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith /// to record information about a use. struct UseMemo { @@ -6680,7 +7073,8 @@ namespace { bool operator<(const UseMemo &L, const UseMemo &R) { return (intptr_t)L.User < (intptr_t)R.User; } -} + +} // end anonymous namespace /// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The same value @@ -6746,7 +7140,6 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, /// based on their topological order. It returns the maximum id and a vector /// of the SDNodes* in assigned order by reference. unsigned SelectionDAG::AssignTopologicalOrder() { - unsigned DAGSize = 0; // SortedPos tracks the progress of the algorithm. Nodes before it are @@ -6872,6 +7265,25 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { AddDbgValue(I, ToNode, false); } +SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, + SDValue NewMemOp) { + assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node"); + // The new memory operation must have the same position as the old load in + // terms of memory dependency. Create a TokenFactor for the old load and new + // memory operation and update uses of the old load's output chain to use that + // TokenFactor. + SDValue OldChain = SDValue(OldLoad, 1); + SDValue NewChain = SDValue(NewMemOp.getNode(), 1); + if (!OldLoad->hasAnyUseOfValue(1)) + return NewChain; + + SDValue TokenFactor = + getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain); + ReplaceAllUsesOfValueWith(OldChain, TokenFactor); + UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain); + return TokenFactor; +} + //===----------------------------------------------------------------------===// // SDNode Class //===----------------------------------------------------------------------===// @@ -6973,6 +7385,7 @@ void SDNode::Profile(FoldingSetNodeID &ID) const { } namespace { + struct EVTArray { std::vector<EVT> VTs; @@ -6982,11 +7395,12 @@ namespace { VTs.push_back(MVT((MVT::SimpleValueType)i)); } }; -} -static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs; +} // end anonymous namespace + +static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs; static ManagedStatic<EVTArray> SimpleVTArray; -static ManagedStatic<sys::SmartMutex<true> > VTMutex; +static ManagedStatic<sys::SmartMutex<true>> VTMutex; /// getValueTypeList - Return a pointer to the specified value type. /// @@ -7020,7 +7434,6 @@ bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const { return NUses == 0; } - /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool SDNode::hasAnyUseOfValue(unsigned Value) const { @@ -7033,9 +7446,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const { return false; } - /// isOnlyUserOf - Return true if this node is the only use of N. -/// bool SDNode::isOnlyUserOf(const SDNode *N) const { bool Seen = false; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { @@ -7049,8 +7460,22 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const { return Seen; } +/// Return true if the only users of N are contained in Nodes. 
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) { + bool Seen = false; + for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { + SDNode *User = *I; + if (llvm::any_of(Nodes, + [&User](const SDNode *Node) { return User == Node; })) + Seen = true; + else + return false; + } + + return Seen; +} + /// isOperand - Return true if this node is an operand of N. -/// bool SDValue::isOperandOf(const SDNode *N) const { for (const SDValue &Op : N->op_values()) if (*this == Op) @@ -7070,21 +7495,39 @@ bool SDNode::isOperandOf(const SDNode *N) const { /// side-effecting instructions on any chain path. In practice, this looks /// through token factors and non-volatile loads. In order to remain efficient, /// this only looks a couple of nodes in, it does not do an exhaustive search. +/// +/// Note that we only need to examine chains when we're searching for +/// side-effects; SelectionDAG requires that all side-effects are represented +/// by chains, even if another operand would force a specific ordering. This +/// constraint is necessary to allow transformations like splitting loads. bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, - unsigned Depth) const { + unsigned Depth) const { if (*this == Dest) return true; // Don't search too deeply, we just want to be able to see through // TokenFactor's etc. if (Depth == 0) return false; - // If this is a token factor, all inputs to the TF happen in parallel. If any - // of the operands of the TF does not reach dest, then we cannot do the xform. + // If this is a token factor, all inputs to the TF happen in parallel. if (getOpcode() == ISD::TokenFactor) { - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1)) - return false; - return true; + // First, try a shallow search. + if (is_contained((*this)->ops(), Dest)) { + // We found the chain we want as an operand of this TokenFactor. + // Essentially, we reach the chain without side-effects if we could + // serialize the TokenFactor into a simple chain of operations with + // Dest as the last operation. This is automatically true if the + // chain has one use: there are no other ordering constraints. + // If the chain has more than one use, we give up: some other + // use of Dest might force a side-effect between Dest and the current + // node. + if (Dest.hasOneUse()) + return true; + } + // Next, try a deep search: check whether every operand of the TokenFactor + // reaches Dest. + return llvm::all_of((*this)->ops(), [=](SDValue Op) { + return Op.reachesChainWithoutSideEffects(Dest, Depth - 1); + }); } // Loads don't have side effects, look through them. 
@@ -7102,20 +7545,8 @@ bool SDNode::hasPredecessor(const SDNode *N) const { return hasPredecessorHelper(N, Visited, Worklist); } -uint64_t SDNode::getConstantOperandVal(unsigned Num) const { - assert(Num < NumOperands && "Invalid child # of SDNode!"); - return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); -} - -const SDNodeFlags *SDNode::getFlags() const { - if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) - return &FlagsNode->Flags; - return nullptr; -} - -void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) { - if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) - FlagsNode->Flags.intersectWith(Flags); +void SDNode::intersectFlagsWith(const SDNodeFlags Flags) { + this->Flags.intersectWith(Flags); } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { @@ -7204,49 +7635,16 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD, SDValue Loc = LD->getOperand(1); SDValue BaseLoc = Base->getOperand(1); - if (Loc.getOpcode() == ISD::FrameIndex) { - if (BaseLoc.getOpcode() != ISD::FrameIndex) - return false; - const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); - int FI = cast<FrameIndexSDNode>(Loc)->getIndex(); - int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex(); - int FS = MFI.getObjectSize(FI); - int BFS = MFI.getObjectSize(BFI); - if (FS != BFS || FS != (int)Bytes) return false; - return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); - } - - // Handle X + C. - if (isBaseWithConstantOffset(Loc)) { - int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); - if (Loc.getOperand(0) == BaseLoc) { - // If the base location is a simple address with no offset itself, then - // the second load's first add operand should be the base address. - if (LocOffset == Dist * (int)Bytes) - return true; - } else if (isBaseWithConstantOffset(BaseLoc)) { - // The base location itself has an offset, so subtract that value from the - // second load's offset before comparing to distance * size. - int64_t BOffset = - cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue(); - if (Loc.getOperand(0) == BaseLoc.getOperand(0)) { - if ((LocOffset - BOffset) == Dist * (int)Bytes) - return true; - } - } - } - const GlobalValue *GV1 = nullptr; - const GlobalValue *GV2 = nullptr; - int64_t Offset1 = 0; - int64_t Offset2 = 0; - bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1); - bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); - if (isGA1 && isGA2 && GV1 == GV2) - return Offset1 == (Offset2 + Dist*Bytes); + + auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this); + auto LocDecomp = BaseIndexOffset::match(Loc, *this); + + int64_t Offset = 0; + if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset)) + return (Dist * Bytes == Offset); return false; } - /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if /// it cannot be inferred. 
unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { @@ -7255,10 +7653,9 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { int64_t GVOffset = 0; if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); - APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); - llvm::computeKnownBits(const_cast<GlobalValue *>(GV), KnownZero, KnownOne, - getDataLayout()); - unsigned AlignBits = KnownZero.countTrailingOnes(); + KnownBits Known(PtrWidth); + llvm::computeKnownBits(GV, Known, getDataLayout()); + unsigned AlignBits = Known.countMinTrailingZeros(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); @@ -7292,14 +7689,11 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const { // Currently all types are split in half. EVT LoVT, HiVT; - if (!VT.isVector()) { + if (!VT.isVector()) LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT); - } else { - unsigned NumElements = VT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(), - NumElements/2); - } + else + LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext()); + return std::make_pair(LoVT, HiVT); } @@ -7341,59 +7735,58 @@ unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); } - Type *ConstantPoolSDNode::getType() const { if (isMachineConstantPoolEntry()) return Val.MachineCPVal->getType(); return Val.ConstVal->getType(); } -bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, - APInt &SplatUndef, +bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits, - bool isBigEndian) const { + bool IsBigEndian) const { EVT VT = getValueType(0); assert(VT.isVector() && "Expected a vector type"); - unsigned sz = VT.getSizeInBits(); - if (MinSplatBits > sz) + unsigned VecWidth = VT.getSizeInBits(); + if (MinSplatBits > VecWidth) return false; - SplatValue = APInt(sz, 0); - SplatUndef = APInt(sz, 0); + // FIXME: The widths are based on this node's type, but build vectors can + // truncate their operands. + SplatValue = APInt(VecWidth, 0); + SplatUndef = APInt(VecWidth, 0); - // Get the bits. Bits with undefined values (when the corresponding element + // Get the bits. Bits with undefined values (when the corresponding element // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared - // in SplatValue. If any of the values are not constant, give up and return + // in SplatValue. If any of the values are not constant, give up and return // false. - unsigned int nOps = getNumOperands(); - assert(nOps > 0 && "isConstantSplat has 0-size build vector"); - unsigned EltBitSize = VT.getScalarSizeInBits(); + unsigned int NumOps = getNumOperands(); + assert(NumOps > 0 && "isConstantSplat has 0-size build vector"); + unsigned EltWidth = VT.getScalarSizeInBits(); - for (unsigned j = 0; j < nOps; ++j) { - unsigned i = isBigEndian ? nOps-1-j : j; + for (unsigned j = 0; j < NumOps; ++j) { + unsigned i = IsBigEndian ? 
NumOps - 1 - j : j; SDValue OpVal = getOperand(i); - unsigned BitPos = j * EltBitSize; + unsigned BitPos = j * EltWidth; if (OpVal.isUndef()) - SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize); - else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) - SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize). - zextOrTrunc(sz) << BitPos; - else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) - SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos; - else + SplatUndef.setBits(BitPos, BitPos + EltWidth); + else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal)) + SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos); + else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal)) + SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos); + else return false; } - // The build_vector is all constants or undefs. Find the smallest element + // The build_vector is all constants or undefs. Find the smallest element // size that splats the vector. - HasAnyUndefs = (SplatUndef != 0); - while (sz > 8) { - unsigned HalfSize = sz / 2; + // FIXME: This does not work for vectors with elements less than 8 bits. + while (VecWidth > 8) { + unsigned HalfSize = VecWidth / 2; APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize); APInt LowValue = SplatValue.trunc(HalfSize); APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize); @@ -7407,10 +7800,10 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, SplatValue = HighValue | LowValue; SplatUndef = HighUndef & LowUndef; - sz = HalfSize; + VecWidth = HalfSize; } - SplatBitSize = sz; + SplatBitSize = VecWidth; return true; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp new file mode 100644 index 0000000..0d69441 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -0,0 +1,115 @@ +//===-- llvm/CodeGen/SelectionDAGAddressAnalysis.cpp ------- DAG Address +//Analysis ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// + +#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other, + const SelectionDAG &DAG, int64_t &Off) { + // Initial Offset difference. + Off = Other.Offset - Offset; + + if ((Other.Index == Index) && (Other.IsIndexSignExt == IsIndexSignExt)) { + // Trivial match. + if (Other.Base == Base) + return true; + + // Match GlobalAddresses + if (auto *A = dyn_cast<GlobalAddressSDNode>(Base)) + if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base)) + if (A->getGlobal() == B->getGlobal()) { + Off += B->getOffset() - A->getOffset(); + return true; + } + + const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + // Match non-equal FrameIndexes - If both frame indices are fixed + // we know their relative offsets and can compare them. Otherwise + // we must be conservative. 
+    if (auto *A = dyn_cast<FrameIndexSDNode>(Base))
+      if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base))
+        if (MFI.isFixedObjectIndex(A->getIndex()) &&
+            MFI.isFixedObjectIndex(B->getIndex())) {
+          Off += MFI.getObjectOffset(B->getIndex()) -
+                 MFI.getObjectOffset(A->getIndex());
+          return true;
+        }
+  }
+  return false;
+}
+
+/// Parses tree in Ptr for base, index, offset addresses.
+BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
+  // (((B + I*M) + c)) + c ...
+  SDValue Base = Ptr;
+  SDValue Index = SDValue();
+  int64_t Offset = 0;
+  bool IsIndexSignExt = false;
+
+  // Consume constant adds & ors with appropriate masking.
+  while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
+      // Only consider ORs which act as adds.
+      if (Base->getOpcode() == ISD::OR &&
+          !DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue()))
+        break;
+      Offset += C->getSExtValue();
+      Base = Base->getOperand(0);
+      continue;
+    }
+    break;
+  }
+
+  if (Base->getOpcode() == ISD::ADD) {
+    // TODO: The following code appears to be needless as it just
+    //       bails on some Ptrs early, reducing the cases where we
+    //       find equivalence. We should be able to remove this.
+    // Inside a loop the current BASE pointer is calculated using an ADD and a
+    // MUL instruction. In this case Base is the actual BASE pointer.
+    // (i64 add (i64 %array_ptr)
+    //          (i64 mul (i64 %induction_var)
+    //                   (i64 %element_size)))
+    if (Base->getOperand(1)->getOpcode() == ISD::MUL)
+      return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
+
+    // Look at Base + Index + Offset cases.
+    Index = Base->getOperand(1);
+    SDValue PotentialBase = Base->getOperand(0);
+
+    // Skip signextends.
+    if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+      Index = Index->getOperand(0);
+      IsIndexSignExt = true;
+    }
+
+    // Check if Index Offset pattern
+    if (Index->getOpcode() != ISD::ADD ||
+        !isa<ConstantSDNode>(Index->getOperand(1)))
+      return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt);
+
+    Offset += cast<ConstantSDNode>(Index->getOperand(1))->getSExtValue();
+    Index = Index->getOperand(0);
+    if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+      Index = Index->getOperand(0);
+      IsIndexSignExt = true;
+    } else
+      IsIndexSignExt = false;
+    Base = PotentialBase;
+  }
+  return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
+}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 996c95b..1273120 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -83,24 +83,6 @@ LimitFPPrecision("limit-float-precision",
                           "for some float libcalls"),
                  cl::location(LimitFloatPrecision),
                  cl::init(0));
-
-static cl::opt<bool>
-EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
-               cl::desc("Enable fast-math-flags for DAG nodes"));
-
-/// Minimum jump table density for normal functions.
-static cl::opt<unsigned>
-JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
-                 cl::desc("Minimum density for building a jump table in "
-                          "a normal function"));
-
-/// Minimum jump table density for -Os or -Oz functions.
-static cl::opt<unsigned>
-OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden,
-                        cl::desc("Minimum density for building a jump table in "
-                                 "an optsize function"));
-
-
 // Limit the width of DAG chains.
This is important in general to prevent // DAG-based analysis from blowing up. For example, alias analysis and // load clustering may not complete in reasonable time. It is difficult to @@ -117,9 +99,31 @@ OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden, // store [4096 x i8] %data, [4096 x i8]* %buffer static const unsigned MaxParallelChains = 64; +// True if the Value passed requires ABI mangling as it is a parameter to a +// function or a return value from a function which is not an intrinsic. +static bool isABIRegCopy(const Value * V) { + const bool IsRetInst = V && isa<ReturnInst>(V); + const bool IsCallInst = V && isa<CallInst>(V); + const bool IsInLineAsm = + IsCallInst && static_cast<const CallInst *>(V)->isInlineAsm(); + const bool IsIndirectFunctionCall = + IsCallInst && !IsInLineAsm && + !static_cast<const CallInst *>(V)->getCalledFunction(); + // It is possible that the call instruction is an inline asm statement or an + // indirect function call in which case the return value of + // getCalledFunction() would be nullptr. + const bool IsInstrinsicCall = + IsCallInst && !IsInLineAsm && !IsIndirectFunctionCall && + static_cast<const CallInst *>(V)->getCalledFunction()->getIntrinsicID() != + Intrinsic::not_intrinsic; + + return IsRetInst || (IsCallInst && (!IsInLineAsm && !IsInstrinsicCall)); +} + static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, const Value *V); + MVT PartVT, EVT ValueVT, const Value *V, + bool IsABIRegCopy); /// getCopyFromParts - Create a value that contains the specified legal parts /// combined into the value they represent. If the parts combine to a type @@ -129,10 +133,11 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, - Optional<ISD::NodeType> AssertOp = None) { + Optional<ISD::NodeType> AssertOp = None, + bool IsABIRegCopy = false) { if (ValueVT.isVector()) return getCopyFromPartsVector(DAG, DL, Parts, NumParts, - PartVT, ValueVT, V); + PartVT, ValueVT, V, IsABIRegCopy); assert(NumParts > 0 && "No parts to assemble!"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -276,7 +281,8 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V, /// ValueVT (ISD::AssertSext). 
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, const Value *V) { + MVT PartVT, EVT ValueVT, const Value *V, + bool IsABIRegCopy) { assert(ValueVT.isVector() && "Not a vector value"); assert(NumParts > 0 && "No parts to assemble!"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -287,9 +293,18 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, EVT IntermediateVT; MVT RegisterVT; unsigned NumIntermediates; - unsigned NumRegs = - TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, - NumIntermediates, RegisterVT); + unsigned NumRegs; + + if (IsABIRegCopy) { + NumRegs = TLI.getVectorTypeBreakdownForCallingConv( + *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates, + RegisterVT); + } else { + NumRegs = + TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, + NumIntermediates, RegisterVT); + } + assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); @@ -318,9 +333,14 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the // intermediate operands. + EVT BuiltVectorTy = + EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(), + (IntermediateVT.isVector() + ? IntermediateVT.getVectorNumElements() * NumParts + : NumIntermediates)); Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, - DL, ValueVT, Ops); + DL, BuiltVectorTy, Ops); } // There is now one part, held in Val. Correct it to match ValueVT. @@ -359,23 +379,40 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, TLI.isTypeLegal(ValueVT)) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); - // Handle cases such as i8 -> <1 x i1> if (ValueVT.getVectorNumElements() != 1) { - diagnosePossiblyInvalidConstraint(*DAG.getContext(), V, - "non-trivial scalar-to-vector conversion"); - return DAG.getUNDEF(ValueVT); + // Certain ABIs require that vectors are passed as integers. For vectors + // are the same size, this is an obvious bitcast. + if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) { + return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + } else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) { + // Bitcast Val back the original type and extract the corresponding + // vector we want. + unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits(); + EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(), + ValueVT.getVectorElementType(), Elts); + Val = DAG.getBitcast(WiderVecType, Val); + return DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, + DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); + } + + diagnosePossiblyInvalidConstraint( + *DAG.getContext(), V, "non-trivial scalar-to-vector conversion"); + return DAG.getUNDEF(ValueVT); } - if (ValueVT.getVectorNumElements() == 1 && - ValueVT.getVectorElementType() != PartEVT) - Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType()); + // Handle cases such as i8 -> <1 x i1> + EVT ValueSVT = ValueVT.getVectorElementType(); + if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) + Val = ValueVT.isFloatingPoint() ? 
DAG.getFPExtendOrRound(Val, DL, ValueSVT) + : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT); - return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); + return DAG.getBuildVector(ValueVT, DL, Val); } static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl, SDValue Val, SDValue *Parts, unsigned NumParts, - MVT PartVT, const Value *V); + MVT PartVT, const Value *V, bool IsABIRegCopy); /// getCopyToParts - Create a series of nodes that contain the specified value /// split into legal parts. If the parts contain more bits than Val, then, for @@ -383,12 +420,14 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl, static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V, - ISD::NodeType ExtendKind = ISD::ANY_EXTEND) { + ISD::NodeType ExtendKind = ISD::ANY_EXTEND, + bool IsABIRegCopy = false) { EVT ValueVT = Val.getValueType(); // Handle the vector case separately. if (ValueVT.isVector()) - return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V); + return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V, + IsABIRegCopy); unsigned PartBits = PartVT.getSizeInBits(); unsigned OrigNumParts = NumParts; @@ -513,7 +552,9 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, /// value split into legal parts. static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, - MVT PartVT, const Value *V) { + MVT PartVT, const Value *V, + bool IsABIRegCopy) { + EVT ValueVT = Val.getValueType(); assert(ValueVT.isVector() && "Not a vector"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -541,7 +582,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, e = PartVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ElementVT)); - Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops); + Val = DAG.getBuildVector(PartVT, DL, Ops); // FIXME: Use CONCAT for 2x -> 4x. @@ -554,17 +595,23 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, // Promoted vector extract Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); - } else{ - // Vector -> scalar conversion. 
- assert(ValueVT.getVectorNumElements() == 1 && - "Only trivial vector-to-scalar conversions should get here!"); - Val = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, - DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); + } else { + if (ValueVT.getVectorNumElements() == 1) { + Val = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, + DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); - Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); + } else { + assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() && + "lossy conversion of vector to scalar type"); + EVT IntermediateType = + EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); + Val = DAG.getBitcast(IntermediateType, Val); + Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); + } } + assert(Val.getValueType() == PartVT && "Unexpected vector part value type"); Parts[0] = Val; return; } @@ -573,15 +620,31 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, EVT IntermediateVT; MVT RegisterVT; unsigned NumIntermediates; - unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, - IntermediateVT, - NumIntermediates, RegisterVT); + unsigned NumRegs; + if (IsABIRegCopy) { + NumRegs = TLI.getVectorTypeBreakdownForCallingConv( + *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates, + RegisterVT); + } else { + NumRegs = + TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, + NumIntermediates, RegisterVT); + } unsigned NumElements = ValueVT.getVectorNumElements(); assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); + // Convert the vector to the appropiate type if necessary. + unsigned DestVectorNoElts = + NumIntermediates * + (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1); + EVT BuiltVectorTy = EVT::getVectorVT( + *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts); + if (Val.getValueType() != BuiltVectorTy) + Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val); + // Split the vector into intermediate operands. SmallVector<SDValue, 8> Ops(NumIntermediates); for (unsigned i = 0; i != NumIntermediates; ++i) { @@ -614,30 +677,35 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, } } -RegsForValue::RegsForValue() {} +RegsForValue::RegsForValue() { IsABIMangled = false; } RegsForValue::RegsForValue(const SmallVector<unsigned, 4> ®s, MVT regvt, - EVT valuevt) - : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {} + EVT valuevt, bool IsABIMangledValue) + : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs), + RegCount(1, regs.size()), IsABIMangled(IsABIMangledValue) {} RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, - const DataLayout &DL, unsigned Reg, Type *Ty) { + const DataLayout &DL, unsigned Reg, Type *Ty, + bool IsABIMangledValue) { ComputeValueVTs(TLI, DL, Ty, ValueVTs); + IsABIMangled = IsABIMangledValue; + for (EVT ValueVT : ValueVTs) { - unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT); - MVT RegisterVT = TLI.getRegisterType(Context, ValueVT); + unsigned NumRegs = IsABIMangledValue + ? TLI.getNumRegistersForCallingConv(Context, ValueVT) + : TLI.getNumRegisters(Context, ValueVT); + MVT RegisterVT = IsABIMangledValue + ? 
TLI.getRegisterTypeForCallingConv(Context, ValueVT) + : TLI.getRegisterType(Context, ValueVT); for (unsigned i = 0; i != NumRegs; ++i) Regs.push_back(Reg + i); RegVTs.push_back(RegisterVT); + RegCount.push_back(NumRegs); Reg += NumRegs; } } -/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -/// this value and returns the result as a ValueVT value. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, @@ -654,8 +722,10 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { // Copy the legal parts from the registers. EVT ValueVT = ValueVTs[Value]; - unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT); - MVT RegisterVT = RegVTs[Value]; + unsigned NumRegs = RegCount[Value]; + MVT RegisterVT = IsABIMangled + ? TLI.getRegisterTypeForCallingConv(RegVTs[Value]) + : RegVTs[Value]; Parts.resize(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { @@ -683,7 +753,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, unsigned RegSize = RegisterVT.getSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; - unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes(); + unsigned NumZeroBits = LOI->Known.countMinLeadingZeros(); if (NumZeroBits == RegSize) { // The current value is a zero. @@ -739,10 +809,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values); } -/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the -/// specified value into the registers specified by this object. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V, @@ -754,9 +820,11 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, unsigned NumRegs = Regs.size(); SmallVector<SDValue, 8> Parts(NumRegs); for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; - unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT); - MVT RegisterVT = RegVTs[Value]; + unsigned NumParts = RegCount[Value]; + + MVT RegisterVT = IsABIMangled + ? TLI.getRegisterTypeForCallingConv(RegVTs[Value]) + : RegVTs[Value]; if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT)) ExtendKind = ISD::ZERO_EXTEND; @@ -796,9 +864,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } -/// AddInlineAsmOperands - Add this value to the specified inlineasm node -/// operand list. This adds the code marker and includes the number of -/// values added into it. 
void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, @@ -840,9 +905,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, } } -void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, +void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, const TargetLibraryInfo *li) { - AA = &aa; + AA = aa; GFI = gfi; LibInfo = li; DL = &DAG.getDataLayout(); @@ -850,12 +915,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, LPadToCallSiteMap.clear(); } -/// clear - Clear out the current SelectionDAG and the associated -/// state and prepare this SelectionDAGBuilder object to be used -/// for a new block. This doesn't clear out information about -/// additional blocks that are needed to complete switch lowering -/// or PHI node updating; that information is cleared out as it is -/// consumed. void SelectionDAGBuilder::clear() { NodeMap.clear(); UnusedArgNodeMap.clear(); @@ -867,21 +926,10 @@ void SelectionDAGBuilder::clear() { StatepointLowering.clear(); } -/// clearDanglingDebugInfo - Clear the dangling debug information -/// map. This function is separated from the clear so that debug -/// information that is dangling in a basic block can be properly -/// resolved in a different basic block. This allows the -/// SelectionDAG to resolve dangling debug information attached -/// to PHI nodes. void SelectionDAGBuilder::clearDanglingDebugInfo() { DanglingDebugInfoMap.clear(); } -/// getRoot - Return the current virtual root of the Selection DAG, -/// flushing any PendingLoad items. This must be done before emitting -/// a store or any other node that may need to be ordered after any -/// prior load instructions. -/// SDValue SelectionDAGBuilder::getRoot() { if (PendingLoads.empty()) return DAG.getRoot(); @@ -901,10 +949,6 @@ SDValue SelectionDAGBuilder::getRoot() { return Root; } -/// getControlRoot - Similar to getRoot, but instead of flushing all the -/// PendingLoad items, flush all the PendingExports items. It is necessary -/// to do this before emitting a terminator instruction. -/// SDValue SelectionDAGBuilder::getControlRoot() { SDValue Root = DAG.getRoot(); @@ -937,7 +981,9 @@ void SelectionDAGBuilder::visit(const Instruction &I) { HandlePHINodesInSuccessorBlocks(I.getParent()); } - ++SDNodeOrder; + // Increase the SDNodeOrder if dealing with a non-debug instruction. 
+ if (!isa<DbgInfoIntrinsic>(I)) + ++SDNodeOrder; CurInst = &I; @@ -1001,10 +1047,12 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) { if (It != FuncInfo.ValueMap.end()) { unsigned InReg = It->second; + RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), - DAG.getDataLayout(), InReg, Ty); + DAG.getDataLayout(), InReg, Ty, isABIRegCopy(V)); SDValue Chain = DAG.getEntryNode(); - Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); + Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, + V); resolveDanglingDebugInfo(V, Result); } @@ -1122,8 +1170,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (isa<ArrayType>(CDS->getType())) return DAG.getMergeValues(Ops, getCurSDLoc()); - return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), - VT, Ops); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { @@ -1175,7 +1222,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { } // Create a BUILD_VECTOR node. - return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } // If this is a static alloca, generate it as the frameindex instead of @@ -1185,14 +1232,15 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) return DAG.getFrameIndex(SI->second, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFrameIndexTy(DAG.getDataLayout())); } // If this is an instruction which fast-isel has deferred, select it now. if (const Instruction *Inst = dyn_cast<Instruction>(V)) { unsigned InReg = FuncInfo.InitializeRegForValue(Inst); + RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg, - Inst->getType()); + Inst->getType(), isABIRegCopy(V)); SDValue Chain = DAG.getEntryNode(); return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } @@ -1384,7 +1432,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { RetPtr.getValueType(), RetPtr, DAG.getIntPtrConstant(Offsets[i], getCurSDLoc()), - &Flags); + Flags); Chains[i] = DAG.getStore(Chain, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), // FIXME: better loc info would be nice. 
@@ -1403,16 +1451,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const Function *F = I.getParent()->getParent(); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; LLVMContext &Context = F->getContext(); - bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, - Attribute::InReg); + bool RetInReg = F->getAttributes().hasAttribute( + AttributeList::ReturnIndex, Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { EVT VT = ValueVTs[j]; @@ -1420,12 +1468,12 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind); - unsigned NumParts = TLI.getNumRegisters(Context, VT); - MVT PartVT = TLI.getRegisterType(Context, VT); + unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, VT); + MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, VT); SmallVector<SDValue, 4> Parts(NumParts); getCopyToParts(DAG, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + j), - &Parts[0], NumParts, PartVT, &I, ExtendKind); + &Parts[0], NumParts, PartVT, &I, ExtendKind, true); // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); @@ -1461,9 +1509,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { true /*isfixed*/, 1 /*origidx*/, 0 /*partOffs*/)); // Create SDNode for the swifterror virtual register. - OutVals.push_back(DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVReg( - FuncInfo.MBB, FuncInfo.SwiftErrorArg), - EVT(TLI.getPointerTy(DL)))); + OutVals.push_back( + DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVRegUseAt( + &I, FuncInfo.MBB, FuncInfo.SwiftErrorArg).first, + EVT(TLI.getPointerTy(DL)))); } bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); @@ -1582,7 +1631,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, BranchProbability TProb, - BranchProbability FProb) { + BranchProbability FProb, + bool InvertCond) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into @@ -1596,10 +1646,14 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, isExportableFromCurrentBlock(BOp->getOperand(1), BB))) { ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) { - Condition = getICmpCondCode(IC->getPredicate()); + ICmpInst::Predicate Pred = + InvertCond ? IC->getInversePredicate() : IC->getPredicate(); + Condition = getICmpCondCode(Pred); } else { const FCmpInst *FC = cast<FCmpInst>(Cond); - Condition = getFCmpCondCode(FC->getPredicate()); + FCmpInst::Predicate Pred = + InvertCond ? FC->getInversePredicate() : FC->getPredicate(); + Condition = getFCmpCondCode(Pred); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); } @@ -1612,7 +1666,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, } // Create a CaseBlock record representing this branch. - CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), + ISD::CondCode Opc = InvertCond ? 
ISD::SETNE : ISD::SETEQ; + CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()), nullptr, TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); } @@ -1625,16 +1680,44 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TProb, - BranchProbability FProb) { - // If this node is not part of the or/and tree, emit it as a branch. + BranchProbability FProb, + bool InvertCond) { + // Skip over not part of the tree and remember to invert op and operands at + // next level. + if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) { + const Value *CondOp = BinaryOperator::getNotArgument(Cond); + if (InBlock(CondOp, CurBB->getBasicBlock())) { + FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb, + !InvertCond); + return; + } + } + const Instruction *BOp = dyn_cast<Instruction>(Cond); + // Compute the effective opcode for Cond, taking into account whether it needs + // to be inverted, e.g. + // and (not (or A, B)), C + // gets lowered as + // and (and (not A, not B), C) + unsigned BOpc = 0; + if (BOp) { + BOpc = BOp->getOpcode(); + if (InvertCond) { + if (BOpc == Instruction::And) + BOpc = Instruction::Or; + else if (BOpc == Instruction::Or) + BOpc = Instruction::And; + } + } + + // If this node is not part of the or/and tree, emit it as a branch. if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) || - (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() || + BOpc != Opc || !BOp->hasOneUse() || BOp->getParent() != CurBB->getBasicBlock() || !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, - TProb, FProb); + TProb, FProb, InvertCond); return; } @@ -1669,14 +1752,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, auto NewFalseProb = TProb / 2 + FProb; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, - NewTrueProb, NewFalseProb); + NewTrueProb, NewFalseProb, InvertCond); // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - Probs[0], Probs[1]); + Probs[0], Probs[1], InvertCond); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: @@ -1702,14 +1785,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, auto NewFalseProb = FProb / 2; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, - NewTrueProb, NewFalseProb); + NewTrueProb, NewFalseProb, InvertCond); // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. 
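
Aside: the InvertCond flag threaded through EmitBranchForMergedCondition and FindMergedConditions above folds a one-use 'not' into the and/or tree by swapping the opcode and inverting the leaf predicates (De Morgan). A self-contained sketch of the same rewrite on a toy expression type (plain C++, not the SelectionDAG code):

    #include <cassert>

    struct Expr {
      enum Kind { Leaf, And, Or, Not } K;
      bool Value;        // used only by Leaf
      const Expr *L;     // left child, or sole operand of Not
      const Expr *R;     // right child of And/Or
    };

    // Evaluate E, where 'Invert' means "produce the negation of E". A 'not'
    // is never materialised: it just flips the flag, And/Or swap roles, and
    // leaf predicates are inverted.
    static bool eval(const Expr &E, bool Invert) {
      switch (E.K) {
      case Expr::Leaf:
        return Invert ? !E.Value : E.Value;
      case Expr::Not:
        return eval(*E.L, !Invert);
      case Expr::And:
        return Invert ? (eval(*E.L, true) || eval(*E.R, true))
                      : (eval(*E.L, false) && eval(*E.R, false));
      case Expr::Or:
        return Invert ? (eval(*E.L, true) && eval(*E.R, true))
                      : (eval(*E.L, false) || eval(*E.R, false));
      }
      return false;
    }

    int main() {
      Expr A{Expr::Leaf, true, nullptr, nullptr};
      Expr B{Expr::Leaf, false, nullptr, nullptr};
      Expr OrAB{Expr::Or, false, &A, &B};
      Expr NotOr{Expr::Not, false, &OrAB, nullptr};
      // not (A or B)  ==  (not A) and (not B)
      assert(eval(NotOr, false) == ((!A.Value) && (!B.Value)));
      return 0;
    }
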
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - Probs[0], Probs[1]); + Probs[0], Probs[1], InvertCond); } } @@ -1793,7 +1876,8 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode, getEdgeProbability(BrMBB, Succ0MBB), - getEdgeProbability(BrMBB, Succ1MBB)); + getEdgeProbability(BrMBB, Succ1MBB), + /*InvertCond=*/false); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. @@ -2027,7 +2111,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, Entry.Node = StackSlot; Entry.Ty = FnTy->getParamType(0); if (Fn->hasAttribute(1, Attribute::AttrKind::InReg)) - Entry.isInReg = true; + Entry.IsInReg = true; Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); @@ -2581,15 +2665,15 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) { Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); Flags.setVectorReduction(vec_redux); - if (EnableFMFInDAG) { - Flags.setAllowReciprocal(FMF.allowReciprocal()); - Flags.setNoInfs(FMF.noInfs()); - Flags.setNoNaNs(FMF.noNaNs()); - Flags.setNoSignedZeros(FMF.noSignedZeros()); - Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); - } + Flags.setAllowReciprocal(FMF.allowReciprocal()); + Flags.setAllowContract(FMF.allowContract()); + Flags.setNoInfs(FMF.noInfs()); + Flags.setNoNaNs(FMF.noNaNs()); + Flags.setNoSignedZeros(FMF.noSignedZeros()); + Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); + SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), - Op1, Op2, &Flags); + Op1, Op2, Flags); setValue(&I, BinNodeValue); } @@ -2642,7 +2726,7 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, - &Flags); + Flags); setValue(&I, Res); } @@ -2654,7 +2738,7 @@ void SelectionDAGBuilder::visitSDiv(const User &I) { Flags.setExact(isa<PossiblyExactOperator>(&I) && cast<PossiblyExactOperator>(&I)->isExact()); setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, - Op2, &Flags)); + Op2, Flags)); } void SelectionDAGBuilder::visitICmp(const User &I) { @@ -2914,7 +2998,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) { DestVT, N)); // convert types. // Check if the original LLVM IR Operand was a ConstantInt, because getValue() // might fold any kind of constant expression to an integer constant and that - // is not what we are looking for. Only regcognize a bitcast of a genuine + // is not what we are looking for. Only recognize a bitcast of a genuine // constant integer as an opaque constant. else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0))) setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false, @@ -3067,14 +3151,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { if (SrcNumElts > MaskNumElts) { // Analyze the access pattern of the vector to see if we can extract - // two subvectors and do the shuffle. The analysis is done by calculating - // the range of elements the mask access on both vectors. - int MinRange[2] = { static_cast<int>(SrcNumElts), - static_cast<int>(SrcNumElts)}; - int MaxRange[2] = {-1, -1}; - - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; + // two subvectors and do the shuffle. 
+ int StartIdx[2] = { -1, -1 }; // StartIdx to extract from + bool CanExtract = true; + for (int Idx : Mask) { unsigned Input = 0; if (Idx < 0) continue; @@ -3083,41 +3163,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Input = 1; Idx -= SrcNumElts; } - if (Idx > MaxRange[Input]) - MaxRange[Input] = Idx; - if (Idx < MinRange[Input]) - MinRange[Input] = Idx; - } - - // Check if the access is smaller than the vector size and can we find - // a reasonable extract index. - int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not - // Extract. - int StartIdx[2]; // StartIdx to extract from - for (unsigned Input = 0; Input < 2; ++Input) { - if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) { - RangeUse[Input] = 0; // Unused - StartIdx[Input] = 0; - continue; - } - // Find a good start index that is a multiple of the mask length. Then - // see if the rest of the elements are in range. - StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; - if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && - StartIdx[Input] + MaskNumElts <= SrcNumElts) - RangeUse[Input] = 1; // Extract from a multiple of the mask length. + // If all the indices come from the same MaskNumElts sized portion of + // the sources we can use extract. Also make sure the extract wouldn't + // extract past the end of the source. + int NewStartIdx = alignDown(Idx, MaskNumElts); + if (NewStartIdx + MaskNumElts > SrcNumElts || + (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx)) + CanExtract = false; + // Make sure we always update StartIdx as we use it to track if all + // elements are undef. + StartIdx[Input] = NewStartIdx; } - if (RangeUse[0] == 0 && RangeUse[1] == 0) { + if (StartIdx[0] < 0 && StartIdx[1] < 0) { setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used. return; } - if (RangeUse[0] >= 0 && RangeUse[1] >= 0) { + if (CanExtract) { // Extract appropriate subvector and generate a vector shuffle for (unsigned Input = 0; Input < 2; ++Input) { SDValue &Src = Input == 0 ? Src1 : Src2; - if (RangeUse[Input] == 0) + if (StartIdx[Input] < 0) Src = DAG.getUNDEF(VT); else { Src = DAG.getNode( @@ -3128,16 +3195,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { } // Calculate new mask. 
- SmallVector<int, 8> MappedOps; - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; - if (Idx >= 0) { - if (Idx < (int)SrcNumElts) - Idx -= StartIdx[0]; - else - Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; - } - MappedOps.push_back(Idx); + SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end()); + for (int &Idx : MappedOps) { + if (Idx >= (int)SrcNumElts) + Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; + else if (Idx >= 0) + Idx -= StartIdx[0]; } setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps)); @@ -3151,8 +3214,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { EVT EltVT = VT.getVectorElementType(); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); SmallVector<SDValue,8> Ops; - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; + for (int Idx : Mask) { SDValue Res; if (Idx < 0) { @@ -3168,10 +3230,16 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Ops.push_back(Res); } - setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops)); + setValue(&I, DAG.getBuildVector(VT, DL, Ops)); } -void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { +void SelectionDAGBuilder::visitInsertValue(const User &I) { + ArrayRef<unsigned> Indices; + if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(&I)) + Indices = IV->getIndices(); + else + Indices = cast<ConstantExpr>(&I)->getIndices(); + const Value *Op0 = I.getOperand(0); const Value *Op1 = I.getOperand(1); Type *AggTy = I.getType(); @@ -3179,7 +3247,7 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { bool IntoUndef = isa<UndefValue>(Op0); bool FromUndef = isa<UndefValue>(Op1); - unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices()); + unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector<EVT, 4> AggValueVTs; @@ -3219,13 +3287,19 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { DAG.getVTList(AggValueVTs), Values)); } -void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) { +void SelectionDAGBuilder::visitExtractValue(const User &I) { + ArrayRef<unsigned> Indices; + if (const ExtractValueInst *EV = dyn_cast<ExtractValueInst>(&I)) + Indices = EV->getIndices(); + else + Indices = cast<ConstantExpr>(&I)->getIndices(); + const Value *Op0 = I.getOperand(0); Type *AggTy = Op0->getType(); Type *ValTy = I.getType(); bool OutOfUndef = isa<UndefValue>(Op0); - unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices()); + unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector<EVT, 4> ValValueVTs; @@ -3281,14 +3355,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); - // In an inbouds GEP with an offset that is nonnegative even when + // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. 
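
Aside: the visitShuffleVector hunk above replaces the MinRange/MaxRange analysis with a single pass that aligns each mask index down to a MaskNumElts-wide window and gives up if either source needs more than one window. A standalone sketch of that test (plain C++; ExtractPlan and planExtract are illustrative names, not LLVM API):

    #include <cassert>
    #include <vector>

    struct ExtractPlan {
      bool CanExtract;
      int StartIdx[2];   // first element to extract from source 0 and source 1
    };

    // A shuffle of two SrcNumElts-wide sources down to Mask.size() lanes can
    // be done as "extract one aligned window from each source, then shuffle"
    // only if every index lands in a single window that is a multiple of the
    // mask length and does not run past its source.
    static ExtractPlan planExtract(const std::vector<int> &Mask, int SrcNumElts) {
      const int MaskNumElts = static_cast<int>(Mask.size());
      ExtractPlan P = {true, {-1, -1}};
      for (int Idx : Mask) {
        if (Idx < 0)
          continue;                      // undef lane: no constraint
        int Input = 0;
        if (Idx >= SrcNumElts) {         // index refers to the second source
          Input = 1;
          Idx -= SrcNumElts;
        }
        int NewStartIdx = (Idx / MaskNumElts) * MaskNumElts;   // alignDown
        if (NewStartIdx + MaskNumElts > SrcNumElts ||
            (P.StartIdx[Input] >= 0 && P.StartIdx[Input] != NewStartIdx))
          P.CanExtract = false;          // indices span more than one window
        P.StartIdx[Input] = NewStartIdx; // -1 still means "all lanes undef"
      }
      return P;
    }

    int main() {
      // Elements 4..7 of an 8-wide first source: one aligned 4-wide window.
      ExtractPlan P = planExtract({4, 5, 6, 7}, 8);
      assert(P.CanExtract && P.StartIdx[0] == 4 && P.StartIdx[1] == -1);
      // Elements 3..6 straddle two windows, so extraction is not possible.
      ExtractPlan Q = planExtract({3, 4, 5, 6}, 8);
      assert(!Q.CanExtract);
      return 0;
    }
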
SDNodeFlags Flags; if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds()) Flags.setNoUnsignedWrap(true); N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, - DAG.getConstant(Offset, dl, N.getValueType()), &Flags); + DAG.getConstant(Offset, dl, N.getValueType()), Flags); } } else { MVT PtrTy = @@ -3318,7 +3392,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds()) Flags.setNoUnsignedWrap(true); - N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags); + N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags); continue; } @@ -3326,7 +3400,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDValue IdxN = getValue(Idx); if (!IdxN.getValueType().isVector() && VectorWidth) { - MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth); + EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth); IdxN = DAG.getSplatBuildVector(VT, dl, IdxN); } @@ -3396,7 +3470,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags); + DAG.getIntPtrConstant(StackAlign - 1, dl), Flags); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, dl, @@ -3459,7 +3533,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); - else if (AA->pointsToConstantMemory(MemoryLocation( + else if (AA && AA->pointsToConstantMemory(MemoryLocation( SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); @@ -3500,7 +3574,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { SDValue A = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, DAG.getConstant(Offsets[i], dl, PtrVT), - &Flags); + Flags); auto MMOFlags = MachineMemOperand::MONone; if (isVolatile) MMOFlags |= MachineMemOperand::MOVolatile; @@ -3510,6 +3584,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { MMOFlags |= MachineMemOperand::MOInvariant; if (isDereferenceable) MMOFlags |= MachineMemOperand::MODereferenceable; + MMOFlags |= TLI.getMMOFlags(I); SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A, MachinePointerInfo(SV, Offsets[i]), Alignment, @@ -3533,8 +3608,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { } void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - assert(TLI.supportSwiftError() && + assert(DAG.getTargetLoweringInfo().supportSwiftError() && "call visitStoreToSwiftError when backend supports swifterror"); SmallVector<EVT, 4> ValueVTs; @@ -3547,15 +3621,15 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { SDValue Src = getValue(SrcV); // Create a virtual register, then update the virtual register. - auto &DL = DAG.getDataLayout(); - const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); - unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC); + unsigned VReg; bool CreatedVReg; + std::tie(VReg, CreatedVReg) = FuncInfo.getOrCreateSwiftErrorVRegDefAt(&I); // Chain, DL, Reg, N or Chain, DL, Reg, N, Glue // Chain can be getRoot or getControlRoot. 
SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg, SDValue(Src.getNode(), Src.getResNo())); DAG.setRoot(CopyNode); - FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg); + if (CreatedVReg) + FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg); } void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { @@ -3571,8 +3645,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { Type *Ty = I.getType(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); - assert(!AA->pointsToConstantMemory(MemoryLocation( - SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) && + assert((!AA || !AA->pointsToConstantMemory(MemoryLocation( + SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) && "load_from_swift_error should not be constant memory"); SmallVector<EVT, 4> ValueVTs; @@ -3585,7 +3659,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { // Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT SDValue L = DAG.getCopyFromReg( getRoot(), getCurSDLoc(), - FuncInfo.getOrCreateSwiftErrorVReg(FuncInfo.MBB, SV), ValueVTs[0]); + FuncInfo.getOrCreateSwiftErrorVRegUseAt(&I, FuncInfo.MBB, SV).first, + ValueVTs[0]); setValue(&I, L); } @@ -3639,6 +3714,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { MMOFlags |= MachineMemOperand::MOVolatile; if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr) MMOFlags |= MachineMemOperand::MONonTemporal; + MMOFlags |= TLI.getMMOFlags(I); // An aggregate load cannot wrap around the address space, so offsets to its // parts don't wrap either. @@ -3655,7 +3731,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { ChainI = 0; } SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT), &Flags); + DAG.getConstant(Offsets[i], dl, PtrVT), Flags); SDValue St = DAG.getStore( Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo); @@ -3853,7 +3929,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. - bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation( + bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation( PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo)); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); @@ -3897,7 +3973,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && - AA->pointsToConstantMemory(MemoryLocation( + AA && AA->pointsToConstantMemory(MemoryLocation( BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. 
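
Aside: the swifterror changes above stop creating a fresh virtual register at every def and use and instead call getOrCreateSwiftErrorVRegDefAt/UseAt, which return a (vreg, created) pair so per-block bookkeeping is only updated for a genuinely new register. A minimal sketch of that get-or-create shape (plain C++; VRegTable is an illustrative stand-in, not the FunctionLoweringInfo API):

    #include <cassert>
    #include <map>
    #include <tuple>
    #include <utility>

    class VRegTable {
      std::map<const void *, unsigned> Table;  // keyed by the defining instruction
      unsigned NextVReg = 0;

    public:
      // Returns {vreg, created}: the caller records the register elsewhere
      // only when 'created' is true.
      std::pair<unsigned, bool> getOrCreateAt(const void *Inst) {
        auto It = Table.find(Inst);
        if (It != Table.end())
          return {It->second, false};   // reuse the existing virtual register
        unsigned VReg = NextVReg++;
        Table.emplace(Inst, VReg);
        return {VReg, true};            // freshly created
      }
    };

    int main() {
      VRegTable T;
      int Inst = 0;
      unsigned VReg;
      bool Created;
      std::tie(VReg, Created) = T.getOrCreateAt(&Inst);
      assert(Created);
      std::tie(VReg, Created) = T.getOrCreateAt(&Inst);  // same site: no new vreg
      assert(!Created && VReg == 0);
      return 0;
    }
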
@@ -3929,7 +4005,7 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering SuccessOrder = I.getSuccessOrdering(); AtomicOrdering FailureOrder = I.getFailureOrdering(); - SynchronizationScope Scope = I.getSynchScope(); + SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); @@ -3939,7 +4015,7 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain, getValue(I.getPointerOperand()), getValue(I.getCompareOperand()), getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()), - /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope); + /*Alignment=*/ 0, SuccessOrder, FailureOrder, SSID); SDValue OutChain = L.getValue(2); @@ -3965,7 +4041,7 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) { case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break; } AtomicOrdering Order = I.getOrdering(); - SynchronizationScope Scope = I.getSynchScope(); + SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); @@ -3976,7 +4052,7 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) { getValue(I.getPointerOperand()), getValue(I.getValOperand()), I.getPointerOperand(), - /* Alignment=*/ 0, Order, Scope); + /* Alignment=*/ 0, Order, SSID); SDValue OutChain = L.getValue(1); @@ -3990,16 +4066,16 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) { SDValue Ops[3]; Ops[0] = getRoot(); Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl, - TLI.getPointerTy(DAG.getDataLayout())); - Ops[2] = DAG.getConstant(I.getSynchScope(), dl, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFenceOperandTy(DAG.getDataLayout())); + Ops[2] = DAG.getConstant(I.getSyncScopeID(), dl, + TLI.getFenceOperandTy(DAG.getDataLayout())); DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering Order = I.getOrdering(); - SynchronizationScope Scope = I.getSynchScope(); + SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); @@ -4017,7 +4093,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { VT.getStoreSize(), I.getAlignment() ? I.getAlignment() : DAG.getEVTAlignment(VT), - AAMDNodes(), nullptr, Scope, Order); + AAMDNodes(), nullptr, SSID, Order); InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG); SDValue L = @@ -4034,7 +4110,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering Order = I.getOrdering(); - SynchronizationScope Scope = I.getSynchScope(); + SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); @@ -4051,7 +4127,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { getValue(I.getPointerOperand()), getValue(I.getValueOperand()), I.getPointerOperand(), I.getAlignment(), - Order, Scope); + Order, SSID); DAG.setRoot(OutChain); } @@ -4695,7 +4771,7 @@ static unsigned getUnderlyingArgReg(const SDValue &N) { /// At the end of instruction selection, they will be inserted to the entry BB. 
bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( const Value *V, DILocalVariable *Variable, DIExpression *Expr, - DILocation *DL, int64_t Offset, bool IsIndirect, const SDValue &N) { + DILocation *DL, int64_t Offset, bool IsDbgDeclare, const SDValue &N) { const Argument *Arg = dyn_cast<Argument>(V); if (!Arg) return false; @@ -4709,9 +4785,11 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction())) return false; + bool IsIndirect = false; Optional<MachineOperand> Op; // Some arguments' frame index is recorded during argument lowering. - if (int FI = FuncInfo.getArgumentFrameIndex(Arg)) + int FI = FuncInfo.getArgumentFrameIndex(Arg); + if (FI != INT_MAX) Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { @@ -4722,15 +4800,19 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (PR) Reg = PR; } - if (Reg) + if (Reg) { Op = MachineOperand::CreateReg(Reg, false); + IsIndirect = IsDbgDeclare; + } } if (!Op) { // Check if ValueMap has reg number. DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V); - if (VMI != FuncInfo.ValueMap.end()) + if (VMI != FuncInfo.ValueMap.end()) { Op = MachineOperand::CreateReg(VMI->second, false); + IsIndirect = IsDbgDeclare; + } } if (!Op && N.getNode()) @@ -4752,7 +4834,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( else FuncInfo.ArgDbgValues.push_back( BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) + .add(*Op) .addImm(Offset) .addMetadata(Variable) .addMetadata(Expr)); @@ -4764,26 +4846,17 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, DILocalVariable *Variable, DIExpression *Expr, int64_t Offset, - DebugLoc dl, + const DebugLoc &dl, unsigned DbgSDNodeOrder) { - SDDbgValue *SDV; - auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode()); - if (FISDN && Expr->startsWithDeref()) { + if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) { // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe // stack slot locations as such instead of as indirectly addressed // locations. - ArrayRef<uint64_t> TrailingElements(Expr->elements_begin() + 1, - Expr->elements_end()); - DIExpression *DerefedDIExpr = - DIExpression::get(*DAG.getContext(), TrailingElements); - int FI = FISDN->getIndex(); - SDV = DAG.getFrameIndexDbgValue(Variable, DerefedDIExpr, FI, 0, dl, - DbgSDNodeOrder); - } else { - SDV = DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, - Offset, dl, DbgSDNodeOrder); + return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(), 0, dl, + DbgSDNodeOrder); } - return SDV; + return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, + Offset, dl, DbgSDNodeOrder); } // VisualStudio defines setjmp as _setjmp @@ -4794,9 +4867,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, # define setjmp_undefined_for_msvc #endif -/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If -/// we want to emit this as a call to a named external function, return the name -/// otherwise lower it and return null. +/// Lower the call to the specified intrinsic function. If we want to emit this +/// as a call to a named external function, return the name. Otherwise, lower it +/// and return null. 
const char * SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -4897,11 +4970,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { updateDAGForMaybeTailCall(MM); return nullptr; } - case Intrinsic::memcpy_element_atomic: { - SDValue Dst = getValue(I.getArgOperand(0)); - SDValue Src = getValue(I.getArgOperand(1)); - SDValue NumElements = getValue(I.getArgOperand(2)); - SDValue ElementSize = getValue(I.getArgOperand(3)); + case Intrinsic::memcpy_element_unordered_atomic: { + const ElementUnorderedAtomicMemCpyInst &MI = + cast<ElementUnorderedAtomicMemCpyInst>(I); + SDValue Dst = getValue(MI.getRawDest()); + SDValue Src = getValue(MI.getRawSource()); + SDValue Length = getValue(MI.getLength()); // Emit a library call. TargetLowering::ArgListTy Args; @@ -4912,31 +4986,101 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Entry.Node = Src; Args.push_back(Entry); - - Entry.Ty = I.getArgOperand(2)->getType(); - Entry.Node = NumElements; + + Entry.Ty = MI.getLength()->getType(); + Entry.Node = Length; + Args.push_back(Entry); + + uint64_t ElementSizeConstant = MI.getElementSizeInBytes(); + RTLIB::Libcall LibraryCall = + RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant); + if (LibraryCall == RTLIB::UNKNOWN_LIBCALL) + report_fatal_error("Unsupported element size"); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( + TLI.getLibcallCallingConv(LibraryCall), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall), + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + DAG.setRoot(CallResult.second); + return nullptr; + } + case Intrinsic::memmove_element_unordered_atomic: { + auto &MI = cast<ElementUnorderedAtomicMemMoveInst>(I); + SDValue Dst = getValue(MI.getRawDest()); + SDValue Src = getValue(MI.getRawSource()); + SDValue Length = getValue(MI.getLength()); + + // Emit a library call. 
+ TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; Args.push_back(Entry); - - Entry.Ty = Type::getInt32Ty(*DAG.getContext()); - Entry.Node = ElementSize; + + Entry.Node = Src; Args.push_back(Entry); - uint64_t ElementSizeConstant = - cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + Entry.Ty = MI.getLength()->getType(); + Entry.Node = Length; + Args.push_back(Entry); + + uint64_t ElementSizeConstant = MI.getElementSizeInBytes(); + RTLIB::Libcall LibraryCall = + RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant); + if (LibraryCall == RTLIB::UNKNOWN_LIBCALL) + report_fatal_error("Unsupported element size"); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( + TLI.getLibcallCallingConv(LibraryCall), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall), + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + DAG.setRoot(CallResult.second); + return nullptr; + } + case Intrinsic::memset_element_unordered_atomic: { + auto &MI = cast<ElementUnorderedAtomicMemSetInst>(I); + SDValue Dst = getValue(MI.getRawDest()); + SDValue Val = getValue(MI.getValue()); + SDValue Length = getValue(MI.getLength()); + + // Emit a library call. + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Args.push_back(Entry); + + Entry.Ty = Type::getInt8Ty(*DAG.getContext()); + Entry.Node = Val; + Args.push_back(Entry); + + Entry.Ty = MI.getLength()->getType(); + Entry.Node = Length; + Args.push_back(Entry); + + uint64_t ElementSizeConstant = MI.getElementSizeInBytes(); RTLIB::Libcall LibraryCall = - RTLIB::getMEMCPY_ELEMENT_ATOMIC(ElementSizeConstant); + RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant); if (LibraryCall == RTLIB::UNKNOWN_LIBCALL) report_fatal_error("Unsupported element size"); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(sdl) - .setChain(getRoot()) - .setCallee(TLI.getLibcallCallingConv(LibraryCall), - Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol( - TLI.getLibcallName(LibraryCall), - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( + TLI.getLibcallCallingConv(LibraryCall), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall), + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); DAG.setRoot(CallResult.second); @@ -4960,6 +5104,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. + const auto *Arg = + dyn_cast<Argument>(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return nullptr; + SDValue &N = NodeMap[Address]; if (!N.getNode() && isa<Argument>(Address)) // Check unused arguments map. @@ -4978,8 +5129,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else if (isa<Argument>(Address)) { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. 
- EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); + EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N); return nullptr; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), @@ -4989,22 +5139,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N)) { - // If variable is pinned by a alloca in dominating bb then - // use StaticAllocaMap. - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) { - if (AI->getParent() != DI.getParent()) { - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI != FuncInfo.StaticAllocaMap.end()) { - SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second, - 0, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, nullptr, false); - return nullptr; - } - } - } DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } @@ -5026,45 +5162,33 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); - } else { - // Do not use getValue() in here; we don't want to generate code at - // this point if it hasn't been done yet. - SDValue N = NodeMap[V]; - if (!N.getNode() && isa<Argument>(V)) - // Check unused arguments map. - N = UnusedArgNodeMap[V]; - if (N.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, - false, N)) { - SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, N.getNode(), false); - } - } else if (!V->use_empty() ) { - // Do not call getValue(V) yet, as we don't want to generate code. - // Remember it for later. - DanglingDebugInfo DDI(&DI, dl, SDNodeOrder); - DanglingDebugInfoMap[V] = DDI; - } else { - // We may expand this to cover more cases. One case where we have no - // data available is an unreferenced parameter. - DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - } + return nullptr; } - // Build a debug info table entry. - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(V)) - V = BCI->getOperand(0); - const AllocaInst *AI = dyn_cast<AllocaInst>(V); - // Don't handle byval struct arguments or VLAs, for example. - if (!AI) { - DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); - DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); + // Do not use getValue() in here; we don't want to generate code at + // this point if it hasn't been done yet. + SDValue N = NodeMap[V]; + if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map. + N = UnusedArgNodeMap[V]; + if (N.getNode()) { + if (EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, false, + N)) + return nullptr; + SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder); + DAG.AddDbgValue(SDV, N.getNode(), false); return nullptr; } - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI == FuncInfo.StaticAllocaMap.end()) - return nullptr; // VLAs. + + if (!V->use_empty() ) { + // Do not call getValue(V) yet, as we don't want to generate code. + // Remember it for later. 
+ DanglingDebugInfo DDI(&DI, dl, SDNodeOrder); + DanglingDebugInfoMap[V] = DDI; + return nullptr; + } + + DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); + DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); return nullptr; } @@ -5202,7 +5326,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue ShOps[2]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, sdl, MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps); + ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, @@ -5301,6 +5425,25 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return nullptr; + case Intrinsic::experimental_constrained_fadd: + case Intrinsic::experimental_constrained_fsub: + case Intrinsic::experimental_constrained_fmul: + case Intrinsic::experimental_constrained_fdiv: + case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: + visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I)); + return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && @@ -5537,7 +5680,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::trap: { StringRef TrapFuncName = I.getAttributes() - .getAttribute(AttributeSet::FunctionIndex, "trap-func-name") + .getAttribute(AttributeList::FunctionIndex, "trap-func-name") .getValueAsString(); if (TrapFuncName.empty()) { ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? @@ -5548,7 +5691,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { TargetLowering::ArgListTy Args; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee( + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( CallingConv::C, I.getType(), DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy(DAG.getDataLayout())), @@ -5629,7 +5772,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = - DAG.getFrameIndex(FI, TLI.getPointerTy(DAG.getDataLayout()), true); + DAG.getFrameIndex(FI, TLI.getFrameIndexTy(DAG.getDataLayout()), true); unsigned Opcode = (IsStart ? 
ISD::LIFETIME_START : ISD::LIFETIME_END); Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops); @@ -5690,7 +5833,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { int FI = FuncInfo.StaticAllocaMap[Slot]; MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(MF.getName()), Idx); + GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, TII->get(TargetOpcode::LOCAL_ESCAPE)) .addSym(FrameAllocSym) @@ -5711,7 +5854,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX)); MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(Fn->getName()), IdxVal); + GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal); // Create a MCSymbol for the label to avoid any target lowering // that would make this PC relative. @@ -5742,13 +5885,142 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, N); return nullptr; } + case Intrinsic::xray_customevent: { + // Here we want to make sure that the intrinsic behaves as if it has a + // specific calling convention, and only for x86_64. + // FIXME: Support other platforms later. + const auto &Triple = DAG.getTarget().getTargetTriple(); + if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux()) + return nullptr; + SDLoc DL = getCurSDLoc(); + SmallVector<SDValue, 8> Ops; + + // We want to say that we always want the arguments in registers. + SDValue LogEntryVal = getValue(I.getArgOperand(0)); + SDValue StrSizeVal = getValue(I.getArgOperand(1)); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Chain = getRoot(); + Ops.push_back(LogEntryVal); + Ops.push_back(StrSizeVal); + Ops.push_back(Chain); + + // We need to enforce the calling convention for the callsite, so that + // argument ordering is enforced correctly, and that register allocation can + // see that some registers may be assumed clobbered and have to preserve + // them across calls to the intrinsic. + MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL, + DL, NodeTys, Ops); + SDValue patchableNode = SDValue(MN, 0); + DAG.setRoot(patchableNode); + setValue(&I, patchableNode); + return nullptr; + } case Intrinsic::experimental_deoptimize: LowerDeoptimizeCall(&I); return nullptr; + + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: { + visitVectorReduce(I, Intrinsic); + return nullptr; + } + } } +void SelectionDAGBuilder::visitConstrainedFPIntrinsic( + const ConstrainedFPIntrinsic &FPI) { + SDLoc sdl = getCurSDLoc(); + unsigned Opcode; + switch (FPI.getIntrinsicID()) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::experimental_constrained_fadd: + Opcode = ISD::STRICT_FADD; + break; + case Intrinsic::experimental_constrained_fsub: + Opcode = ISD::STRICT_FSUB; + break; + case Intrinsic::experimental_constrained_fmul: + Opcode = ISD::STRICT_FMUL; + break; + case Intrinsic::experimental_constrained_fdiv: + Opcode = ISD::STRICT_FDIV; + break; + case Intrinsic::experimental_constrained_frem: + Opcode = ISD::STRICT_FREM; + break; + case Intrinsic::experimental_constrained_sqrt: + Opcode = ISD::STRICT_FSQRT; + break; + case Intrinsic::experimental_constrained_pow: + Opcode = ISD::STRICT_FPOW; + break; + case Intrinsic::experimental_constrained_powi: + Opcode = ISD::STRICT_FPOWI; + break; + case Intrinsic::experimental_constrained_sin: + Opcode = ISD::STRICT_FSIN; + break; + case Intrinsic::experimental_constrained_cos: + Opcode = ISD::STRICT_FCOS; + break; + case Intrinsic::experimental_constrained_exp: + Opcode = ISD::STRICT_FEXP; + break; + case Intrinsic::experimental_constrained_exp2: + Opcode = ISD::STRICT_FEXP2; + break; + case Intrinsic::experimental_constrained_log: + Opcode = ISD::STRICT_FLOG; + break; + case Intrinsic::experimental_constrained_log10: + Opcode = ISD::STRICT_FLOG10; + break; + case Intrinsic::experimental_constrained_log2: + Opcode = ISD::STRICT_FLOG2; + break; + case Intrinsic::experimental_constrained_rint: + Opcode = ISD::STRICT_FRINT; + break; + case Intrinsic::experimental_constrained_nearbyint: + Opcode = ISD::STRICT_FNEARBYINT; + break; + } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Chain = getRoot(); + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs); + ValueVTs.push_back(MVT::Other); // Out chain + + SDVTList VTs = DAG.getVTList(ValueVTs); + SDValue Result; + if (FPI.isUnaryOp()) + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)) }); + else + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)), + getValue(FPI.getArgOperand(1)) }); + + assert(Result.getNode()->getNumValues() == 2); + SDValue OutChain = Result.getValue(1); + DAG.setRoot(OutChain); + SDValue FPResult = Result.getValue(0); + setValue(&FPI, FPResult); +} + std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, const BasicBlock *EHPadBB) { @@ -5827,7 +6099,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Type *RetTy = CS.getType(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; Args.reserve(CS.arg_size()); const Value *SwiftErrorVal = nullptr; @@ -5843,6 +6114,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { + TargetLowering::ArgListEntry Entry; const Value *V = *i; // Skip empty types @@ -5852,24 +6124,25 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue ArgNode = getValue(V); Entry.Node = ArgNode; Entry.Ty = V->getType(); - // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Entry.setAttributes(&CS, i - CS.arg_begin()); // Use swifterror virtual register as input to the call. - if (Entry.isSwiftError && TLI.supportSwiftError()) { + if (Entry.IsSwiftError && TLI.supportSwiftError()) { SwiftErrorVal = V; // We find the virtual register for the actual swifterror argument. // Instead of using the Value, we use the virtual register instead. 
- Entry.Node = - DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVReg(FuncInfo.MBB, V), - EVT(TLI.getPointerTy(DL))); + Entry.Node = DAG.getRegister(FuncInfo + .getOrCreateSwiftErrorVRegUseAt( + CS.getInstruction(), FuncInfo.MBB, V) + .first, + EVT(TLI.getPointerTy(DL))); } Args.push_back(Entry); // If we have an explicit sret argument that is an Instruction, (i.e., it // might point to function-local memory), we can't meaningfully tail-call. - if (Entry.isSRet && isa<Instruction>(V)) + if (Entry.IsSRet && isa<Instruction>(V)) isTailCall = false; } @@ -5903,38 +6176,29 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, if (SwiftErrorVal && TLI.supportSwiftError()) { // Get the last element of InVals. SDValue Src = CLI.InVals.back(); - const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); - unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC); + unsigned VReg; bool CreatedVReg; + std::tie(VReg, CreatedVReg) = + FuncInfo.getOrCreateSwiftErrorVRegDefAt(CS.getInstruction()); SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src); // We update the virtual register for the actual swifterror argument. - FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg); + if (CreatedVReg) + FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg); DAG.setRoot(CopyNode); } } -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. -static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) { - for (const User *U : V->users()) { - if (const ICmpInst *IC = dyn_cast<ICmpInst>(U)) - if (IC->isEquality()) - if (const Constant *C = dyn_cast<Constant>(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, - Type *LoadTy, SelectionDAGBuilder &Builder) { // Check to see if this load can be trivially constant folded, e.g. if the // input is from a string literal. if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) { // Cast pointer to the type we really want to load. + Type *LoadTy = + Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits()); + if (LoadVT.isVector()) + LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements()); + LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput), PointerType::getUnqual(LoadTy)); @@ -5949,7 +6213,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. - if (Builder.AA->pointsToConstantMemory(PtrVal)) { + if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { @@ -5967,8 +6231,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, return LoadVal; } -/// processIntegerCallValue - Record the value for an instruction that -/// produces an integer result, converting the type where necessary. +/// Record the value for an instruction that produces an integer result, +/// converting the type where necessary. void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, SDValue Value, bool IsSigned) { @@ -5981,20 +6245,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, setValue(&I, Value); } -/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form. 
-/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a memcmp call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { - // Verify that the prototype makes sense. int memcmp(void*,void*,size_t) - if (I.getNumArgOperands() != 3) - return false; - const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); - if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || - !I.getArgOperand(2)->getType()->isIntegerTy() || - !I.getType()->isIntegerTy()) - return false; - const Value *Size = I.getArgOperand(2); const ConstantInt *CSize = dyn_cast<ConstantInt>(Size); if (CSize && CSize->getZExtValue() == 0) { @@ -6005,11 +6262,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { } const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); - std::pair<SDValue, SDValue> Res = - TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(), - getValue(LHS), getValue(RHS), getValue(Size), - MachinePointerInfo(LHS), - MachinePointerInfo(RHS)); + std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp( + DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), + getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); @@ -6018,88 +6273,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 - if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) { - bool ActuallyDoIt = true; - MVT LoadVT; - Type *LoadTy; - switch (CSize->getZExtValue()) { - default: - LoadVT = MVT::Other; - LoadTy = nullptr; - ActuallyDoIt = false; - break; - case 2: - LoadVT = MVT::i16; - LoadTy = Type::getInt16Ty(CSize->getContext()); - break; - case 4: - LoadVT = MVT::i32; - LoadTy = Type::getInt32Ty(CSize->getContext()); - break; - case 8: - LoadVT = MVT::i64; - LoadTy = Type::getInt64Ty(CSize->getContext()); - break; - /* - case 16: - LoadVT = MVT::v4i32; - LoadTy = Type::getInt32Ty(CSize->getContext()); - LoadTy = VectorType::get(LoadTy, 4); - break; - */ - } - - // This turns into unaligned loads. We only do this if the target natively - // supports the MVT we'll be loading or if it is small enough (<= 4) that - // we'll only produce a small number of byte loads. + if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I)) + return false; - // Require that we can find a legal MVT, and only do this if the target - // supports unaligned loads of that type. Expanding into byte loads would - // bloat the code. + // If the target has a fast compare for the given size, it will return a + // preferred load type for that size. Require that the load VT is legal and + // that the target supports unaligned loads of that type. Otherwise, return + // INVALID. 
+ auto hasFastLoadsAndCompare = [&](unsigned NumBits) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (ActuallyDoIt && CSize->getZExtValue() > 4) { - unsigned DstAS = LHS->getType()->getPointerAddressSpace(); - unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); + MVT LVT = TLI.hasFastEqualityCompare(NumBits); + if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) { // TODO: Handle 5 byte compare as 4-byte + 1 byte. // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. // TODO: Check alignment of src and dest ptrs. - if (!TLI.isTypeLegal(LoadVT) || - !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) || - !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS)) - ActuallyDoIt = false; + unsigned DstAS = LHS->getType()->getPointerAddressSpace(); + unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); + if (!TLI.isTypeLegal(LVT) || + !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) || + !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS)) + LVT = MVT::INVALID_SIMPLE_VALUE_TYPE; } - if (ActuallyDoIt) { - SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this); - SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this); + return LVT; + }; - SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, - ISD::SETNE); - processIntegerCallValue(I, Res, false); - return true; - } + // This turns into unaligned loads. We only do this if the target natively + // supports the MVT we'll be loading or if it is small enough (<= 4) that + // we'll only produce a small number of byte loads. + MVT LoadVT; + unsigned NumBitsToCompare = CSize->getZExtValue() * 8; + switch (NumBitsToCompare) { + default: + return false; + case 16: + LoadVT = MVT::i16; + break; + case 32: + LoadVT = MVT::i32; + break; + case 64: + case 128: + case 256: + LoadVT = hasFastLoadsAndCompare(NumBitsToCompare); + break; } + if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE) + return false; - return false; + SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this); + SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this); + + // Bitcast to a wide integer type if the loads are vectors. + if (LoadVT.isVector()) { + EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits()); + LoadL = DAG.getBitcast(CmpVT, LoadL); + LoadR = DAG.getBitcast(CmpVT, LoadR); + } + + SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE); + processIntegerCallValue(I, Cmp, false); + return true; } -/// visitMemChrCall -- See if we can lower a memchr call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a memchr call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { - // Verify that the prototype makes sense. 
void *memchr(void *, int, size_t) - if (I.getNumArgOperands() != 3) - return false; - const Value *Src = I.getArgOperand(0); const Value *Char = I.getArgOperand(1); const Value *Length = I.getArgOperand(2); - if (!Src->getType()->isPointerTy() || - !Char->getType()->isIntegerTy() || - !Length->getType()->isIntegerTy() || - !I.getType()->isPointerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6115,15 +6361,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { return false; } -/// -/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to -/// to adjust the dst pointer by the size of the copied memory. +/// See if we can lower a mempcpy call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { - - // Verify argument count: void *mempcpy(void *, const void *, size_t) - if (I.getNumArgOperands() != 3) - return false; - SDValue Dst = getValue(I.getArgOperand(0)); SDValue Src = getValue(I.getArgOperand(1)); SDValue Size = getValue(I.getArgOperand(2)); @@ -6158,19 +6401,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { return true; } -/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an -/// optimized form. If so, return true and lower it, otherwise return false -/// and it will be lowered like a normal call. +/// See if we can lower a strcpy call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { - // Verify that the prototype makes sense. char *strcpy(char *, char *) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isPointerTy() || - !I.getType()->isPointerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6187,19 +6424,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { return false; } -/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a strcmp call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { - // Verify that the prototype makes sense. 
int strcmp(void*,void*) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isPointerTy() || - !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6216,17 +6447,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { return false; } -/// visitStrLenCall -- See if we can lower a strlen call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strlen call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { - // Verify that the prototype makes sense. size_t strlen(char *) - if (I.getNumArgOperands() != 1) - return false; - const Value *Arg0 = I.getArgOperand(0); - if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6241,19 +6468,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { return false; } -/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strnlen call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { - // Verify that the prototype makes sense. size_t strnlen(char *, size_t) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isIntegerTy() || - !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6269,16 +6490,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { return false; } -/// visitUnaryFloatCall - If a call instruction is a unary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a unary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it, otherwise return +/// false and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { - // Sanity check that it really is a unary floating-point call. - if (I.getNumArgOperands() != 1 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - !I.onlyReadsMemory()) + // We already checked this call's prototype; verify it doesn't modify errno. 
+ if (!I.onlyReadsMemory()) return false; SDValue Tmp = getValue(I.getArgOperand(0)); @@ -6286,17 +6506,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, return true; } -/// visitBinaryFloatCall - If a call instruction is a binary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a binary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it. Otherwise return +/// false, and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { - // Sanity check that it really is a binary floating-point call. - if (I.getNumArgOperands() != 2 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - I.getType() != I.getArgOperand(1)->getType() || - !I.onlyReadsMemory()) + // We already checked this call's prototype; verify it doesn't modify errno. + if (!I.onlyReadsMemory()) return false; SDValue Tmp0 = getValue(I.getArgOperand(0)); @@ -6336,20 +6554,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for // some reason. - LibFunc::Func Func; + LibFunc Func; if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && - LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; - case LibFunc::copysign: - case LibFunc::copysignf: - case LibFunc::copysignl: - if (I.getNumArgOperands() == 2 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.getType() == I.getArgOperand(1)->getType() && - I.onlyReadsMemory()) { + case LibFunc_copysign: + case LibFunc_copysignf: + case LibFunc_copysignl: + // We already checked this call's prototype; verify it doesn't modify + // errno. 
+ if (I.onlyReadsMemory()) { SDValue LHS = getValue(I.getArgOperand(0)); SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(), @@ -6357,122 +6573,122 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { return; } break; - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: + case LibFunc_fabs: + case LibFunc_fabsf: + case LibFunc_fabsl: if (visitUnaryFloatCall(I, ISD::FABS)) return; break; - case LibFunc::fmin: - case LibFunc::fminf: - case LibFunc::fminl: + case LibFunc_fmin: + case LibFunc_fminf: + case LibFunc_fminl: if (visitBinaryFloatCall(I, ISD::FMINNUM)) return; break; - case LibFunc::fmax: - case LibFunc::fmaxf: - case LibFunc::fmaxl: + case LibFunc_fmax: + case LibFunc_fmaxf: + case LibFunc_fmaxl: if (visitBinaryFloatCall(I, ISD::FMAXNUM)) return; break; - case LibFunc::sin: - case LibFunc::sinf: - case LibFunc::sinl: + case LibFunc_sin: + case LibFunc_sinf: + case LibFunc_sinl: if (visitUnaryFloatCall(I, ISD::FSIN)) return; break; - case LibFunc::cos: - case LibFunc::cosf: - case LibFunc::cosl: + case LibFunc_cos: + case LibFunc_cosf: + case LibFunc_cosl: if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; - case LibFunc::sqrt: - case LibFunc::sqrtf: - case LibFunc::sqrtl: - case LibFunc::sqrt_finite: - case LibFunc::sqrtf_finite: - case LibFunc::sqrtl_finite: + case LibFunc_sqrt: + case LibFunc_sqrtf: + case LibFunc_sqrtl: + case LibFunc_sqrt_finite: + case LibFunc_sqrtf_finite: + case LibFunc_sqrtl_finite: if (visitUnaryFloatCall(I, ISD::FSQRT)) return; break; - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: + case LibFunc_floor: + case LibFunc_floorf: + case LibFunc_floorl: if (visitUnaryFloatCall(I, ISD::FFLOOR)) return; break; - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: + case LibFunc_nearbyint: + case LibFunc_nearbyintf: + case LibFunc_nearbyintl: if (visitUnaryFloatCall(I, ISD::FNEARBYINT)) return; break; - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: + case LibFunc_ceil: + case LibFunc_ceilf: + case LibFunc_ceill: if (visitUnaryFloatCall(I, ISD::FCEIL)) return; break; - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: + case LibFunc_rint: + case LibFunc_rintf: + case LibFunc_rintl: if (visitUnaryFloatCall(I, ISD::FRINT)) return; break; - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: + case LibFunc_round: + case LibFunc_roundf: + case LibFunc_roundl: if (visitUnaryFloatCall(I, ISD::FROUND)) return; break; - case LibFunc::trunc: - case LibFunc::truncf: - case LibFunc::truncl: + case LibFunc_trunc: + case LibFunc_truncf: + case LibFunc_truncl: if (visitUnaryFloatCall(I, ISD::FTRUNC)) return; break; - case LibFunc::log2: - case LibFunc::log2f: - case LibFunc::log2l: + case LibFunc_log2: + case LibFunc_log2f: + case LibFunc_log2l: if (visitUnaryFloatCall(I, ISD::FLOG2)) return; break; - case LibFunc::exp2: - case LibFunc::exp2f: - case LibFunc::exp2l: + case LibFunc_exp2: + case LibFunc_exp2f: + case LibFunc_exp2l: if (visitUnaryFloatCall(I, ISD::FEXP2)) return; break; - case LibFunc::memcmp: + case LibFunc_memcmp: if (visitMemCmpCall(I)) return; break; - case LibFunc::mempcpy: + case LibFunc_mempcpy: if (visitMemPCpyCall(I)) return; break; - case LibFunc::memchr: + case LibFunc_memchr: if (visitMemChrCall(I)) return; break; - case LibFunc::strcpy: + case LibFunc_strcpy: if (visitStrCpyCall(I, false)) return; break; - case LibFunc::stpcpy: + case LibFunc_stpcpy: if (visitStrCpyCall(I, 
true)) return; break; - case LibFunc::strcmp: + case LibFunc_strcmp: if (visitStrCmpCall(I)) return; break; - case LibFunc::strlen: + case LibFunc_strlen: if (visitStrLenCall(I)) return; break; - case LibFunc::strnlen: + case LibFunc_strnlen: if (visitStrNLenCall(I)) return; break; @@ -6648,7 +6864,7 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location, unsigned Align = DL.getPrefTypeAlignment(Ty); MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy(DL)); + SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL)); Chain = DAG.getStore(Chain, Location, OpInfo.CallOperand, StackSlot, MachinePointerInfo::getFixedStack(MF, SSFI)); OpInfo.CallOperand = StackSlot; @@ -6671,12 +6887,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, MachineFunction &MF = DAG.getMachineFunction(); SmallVector<unsigned, 4> Regs; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // If this is a constraint for a single physreg, or a constraint for a // register class, find it. std::pair<unsigned, const TargetRegisterClass *> PhysReg = - TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(), - OpInfo.ConstraintCode, + TLI.getRegForInlineAsmConstraint(&TRI, OpInfo.ConstraintCode, OpInfo.ConstraintVT); unsigned NumRegs = 1; @@ -6684,12 +6900,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, // If this is a FP input in an integer register (or visa versa) insert a bit // cast of the input value. More generally, handle any case where the input // value disagrees with the register class we plan to stick this in. - if (OpInfo.Type == InlineAsm::isInput && - PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) { + if (OpInfo.Type == InlineAsm::isInput && PhysReg.second && + !TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) { // Try to convert to the first EVT that the reg class contains. If the // types are identical size, use a bitcast to convert (e.g. two differing // vector types). - MVT RegVT = *PhysReg.second->vt_begin(); + MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second); if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) { OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand); @@ -6717,12 +6933,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, if (unsigned AssignedReg = PhysReg.first) { const TargetRegisterClass *RC = PhysReg.second; if (OpInfo.ConstraintVT == MVT::Other) - ValueVT = *RC->vt_begin(); + ValueVT = *TRI.legalclasstypes_begin(*RC); // Get the actual register value type. This is important, because the user // may have asked for (e.g.) the AX register in i32 type. We need to // remember that AX is actually i16 to get the right extension. - RegVT = *RC->vt_begin(); + RegVT = *TRI.legalclasstypes_begin(*RC); // This is a explicit reference to a physical register. Regs.push_back(AssignedReg); @@ -6748,7 +6964,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, // Otherwise, if this was a reference to an LLVM register class, create vregs // for this reference. 
if (const TargetRegisterClass *RC = PhysReg.second) { - RegVT = *RC->vt_begin(); + RegVT = *TRI.legalclasstypes_begin(*RC); if (OpInfo.ConstraintVT == MVT::Other) ValueVT = RegVT; @@ -7085,8 +7301,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { SDLoc dl = getCurSDLoc(); // Use the produced MatchedRegs object to - MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, - Chain, &Flag, CS.getInstruction()); + MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag, + CS.getInstruction()); MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, true, OpInfo.getMatchedOperand(), dl, DAG, AsmNodeOperands); @@ -7361,7 +7577,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo( // Populate the argument list. // Attributes for args start at offset 1, after the return attribute. - for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; + for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) { const Value *V = CS->getOperand(ArgI); @@ -7370,7 +7586,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo( TargetLowering::ArgListEntry Entry; Entry.Node = getValue(V); Entry.Ty = V->getType(); - Entry.setAttributes(&CS, AttrI); + Entry.setAttributes(&CS, ArgIdx); Args.push_back(Entry); } @@ -7411,7 +7627,7 @@ static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx, } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(OpVal)) { const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo(); Ops.push_back(Builder.DAG.getTargetFrameIndex( - FI->getIndex(), TLI.getPointerTy(Builder.DAG.getDataLayout()))); + FI->getIndex(), TLI.getFrameIndexTy(Builder.DAG.getDataLayout()))); } else Ops.push_back(OpVal); } @@ -7437,11 +7653,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // have to worry about calling conventions and target specific lowering code. // Instead we perform the call lowering right here. // - // chain, flag = CALLSEQ_START(chain, 0) + // chain, flag = CALLSEQ_START(chain, 0, 0) // chain, flag = STACKMAP(id, nbytes, ..., chain, flag) // chain, flag = CALLSEQ_END(chain, 0, 0, flag) // - Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL); + Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL); InFlag = Chain.getValue(1); // Add the <id> and <numBytes> constants. 
@@ -7631,9 +7847,79 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } -/// Returns an AttributeSet representing the attributes applied to the return +void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, + unsigned Intrinsic) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2; + if (I.getNumArgOperands() > 1) + Op2 = getValue(I.getArgOperand(1)); + SDLoc dl = getCurSDLoc(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + SDValue Res; + FastMathFlags FMF; + if (isa<FPMathOperator>(I)) + FMF = I.getFastMathFlags(); + SDNodeFlags SDFlags; + SDFlags.setNoNaNs(FMF.noNaNs()); + + switch (Intrinsic) { + case Intrinsic::experimental_vector_reduce_fadd: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_fmul: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_add: + Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_mul: + Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_and: + Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_or: + Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_xor: + Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smax: + Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smin: + Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umax: + Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umin: + Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_fmax: { + Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); + break; + } + case Intrinsic::experimental_vector_reduce_fmin: { + Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); + break; + } + default: + llvm_unreachable("Unhandled vector reduce intrinsic"); + } + setValue(&I, Res); +} + +/// Returns an AttributeList representing the attributes applied to the return /// value of the given call. 
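A side note on the visitVectorReduce hunk above: the floating-point reductions come in two flavours. Without unsafe-algebra fast-math flags the patch emits the strict nodes (VECREDUCE_STRICT_FADD/FMUL), which take the scalar start value as their first operand and reduce in element order; with unsafe algebra it emits the relaxed VECREDUCE_FADD/FMUL, which may reassociate. A small C++ sketch of the difference, with made-up names and a 4-element vector assumed:

    #include <array>

    float reduce_fadd_strict(float Start, const std::array<float, 4> &V) {
      // Strict form: accumulate in element order from the explicit start
      // value, so rounding happens in exactly this sequence.
      float Acc = Start;
      for (float E : V)
        Acc += E;
      return Acc;
    }

    float reduce_fadd_fast(const std::array<float, 4> &V) {
      // Relaxed form: any association is allowed, for example the pairwise
      // tree a vector unit would naturally use.
      return (V[0] + V[1]) + (V[2] + V[3]);
    }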
-static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { SmallVector<Attribute::AttrKind, 2> Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); @@ -7642,8 +7928,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); - return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, - Attrs); + return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, + Attrs); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo @@ -7660,6 +7946,22 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { auto &DL = CLI.DAG.getDataLayout(); ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets); + if (CLI.IsPostTypeLegalization) { + // If we are lowering a libcall after legalization, split the return type. + SmallVector<EVT, 4> OldRetTys = std::move(RetTys); + SmallVector<uint64_t, 4> OldOffsets = std::move(Offsets); + for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) { + EVT RetVT = OldRetTys[i]; + uint64_t Offset = OldOffsets[i]; + MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT); + unsigned RegisterVTSize = RegisterVT.getSizeInBits(); + RetTys.append(NumRegs, RegisterVT); + for (unsigned j = 0; j != NumRegs; ++j) + Offsets.push_back(Offset + j * RegisterVTSize); + } + } + SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL); @@ -7679,19 +7981,19 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false); Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy); - DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy(DL)); + DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL)); ArgListEntry Entry; Entry.Node = DemoteStackSlot; Entry.Ty = StackSlotPtrType; - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isInReg = false; - Entry.isSRet = true; - Entry.isNest = false; - Entry.isByVal = false; - Entry.isReturned = false; - Entry.isSwiftSelf = false; - Entry.isSwiftError = false; + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsInReg = false; + Entry.IsSRet = true; + Entry.IsNest = false; + Entry.IsByVal = false; + Entry.IsReturned = false; + Entry.IsSwiftSelf = false; + Entry.IsSwiftError = false; Entry.Alignment = Align; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); @@ -7702,8 +8004,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } else { for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; - MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); + MVT RegisterVT = + getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT); + unsigned NumRegs = + getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags; MyFlags.VT = RegisterVT; @@ -7724,7 +8028,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ArgListTy &Args = CLI.getArgs(); if (supportSwiftError()) { for (unsigned i = 0, e = Args.size(); i != e; ++i) { - if (Args[i].isSwiftError) { + if (Args[i].IsSwiftError) { ISD::InputArg MyFlags; 
MyFlags.VT = getPointerTy(DL); MyFlags.ArgVT = EVT(getPointerTy(DL)); @@ -7740,8 +8044,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs); + // FIXME: Split arguments if CLI.IsPostTypeLegalization Type *FinalType = Args[i].Ty; - if (Args[i].isByVal) + if (Args[i].IsByVal) FinalType = cast<PointerType>(Args[i].Ty)->getElementType(); bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg); @@ -7752,13 +8057,17 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - if (Args[i].isZExt) + // Certain targets (such as MIPS), may have a different ABI alignment + // for a type depending on the context. Give the target a chance to + // specify the alignment it wants. + unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL); + + if (Args[i].IsZExt) Flags.setZExt(); - if (Args[i].isSExt) + if (Args[i].IsSExt) Flags.setSExt(); - if (Args[i].isInReg) { + if (Args[i].IsInReg) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (CLI.CallConv == CallingConv::X86_VectorCall && @@ -7771,15 +8080,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Set InReg Flag Flags.setInReg(); } - if (Args[i].isSRet) + if (Args[i].IsSRet) Flags.setSRet(); - if (Args[i].isSwiftSelf) + if (Args[i].IsSwiftSelf) Flags.setSwiftSelf(); - if (Args[i].isSwiftError) + if (Args[i].IsSwiftError) Flags.setSwiftError(); - if (Args[i].isByVal) + if (Args[i].IsByVal) Flags.setByVal(); - if (Args[i].isInAlloca) { + if (Args[i].IsInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated @@ -7788,7 +8097,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // in the various CC lowering callbacks. 
Flags.setByVal(); } - if (Args[i].isByVal || Args[i].isInAlloca) { + if (Args[i].IsByVal || Args[i].IsInAlloca) { PointerType *Ty = cast<PointerType>(Args[i].Ty); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); @@ -7801,24 +8110,25 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { FrameAlign = getByValTypeAlignment(ElementTy, DL); Flags.setByValAlign(FrameAlign); } - if (Args[i].isNest) + if (Args[i].IsNest) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); - MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); - unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT); + MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT); + unsigned NumParts = + getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT); SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (Args[i].isSExt) + if (Args[i].IsSExt) ExtendKind = ISD::SIGN_EXTEND; - else if (Args[i].isZExt) + else if (Args[i].IsZExt) ExtendKind = ISD::ZERO_EXTEND; // Conservatively only handle 'returned' on non-vectors for now - if (Args[i].isReturned && !Op.getValueType().isVector()) { + if (Args[i].IsReturned && !Op.getValueType().isVector()) { assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that @@ -7832,13 +8142,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // parameter extension method is not compatible with the return // extension method if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) || - (ExtendKind != ISD::ANY_EXTEND && - CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt)) - Flags.setReturned(); + (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt && + CLI.RetZExt == Args[i].IsZExt)) + Flags.setReturned(); } getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, - CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind); + CLI.CS ? 
CLI.CS->getInstruction() : nullptr, ExtendKind, + true); for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 @@ -7916,7 +8227,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { for (unsigned i = 0; i < NumValues; ++i) { SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, - PtrVT), &Flags); + PtrVT), Flags); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), @@ -7938,12 +8249,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { unsigned CurReg = 0; for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; - MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); + MVT RegisterVT = + getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT); + unsigned NumRegs = + getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT); ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, - AssertOp)); + AssertOp, true)); CurReg += NumRegs; } @@ -7979,8 +8292,11 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // If this is an InlineAsm we have to match the registers required, not the + // notional registers required by the type. + RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, - V->getType()); + V->getType(), isABIRegCopy(V)); SDValue Chain = DAG.getEntryNode(); ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) == @@ -8010,6 +8326,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { return true; } +typedef DenseMap<const Argument *, + std::pair<const AllocaInst *, const StoreInst *>> + ArgCopyElisionMapTy; + +/// Scan the entry block of the function in FuncInfo for arguments that look +/// like copies into a local alloca. Record any copied arguments in +/// ArgCopyElisionCandidates. +static void +findArgumentCopyElisionCandidates(const DataLayout &DL, + FunctionLoweringInfo *FuncInfo, + ArgCopyElisionMapTy &ArgCopyElisionCandidates) { + // Record the state of every static alloca used in the entry block. Argument + // allocas are all used in the entry block, so we need approximately as many + // entries as we have arguments. + enum StaticAllocaInfo { Unknown, Clobbered, Elidable }; + SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas; + unsigned NumArgs = FuncInfo->Fn->arg_size(); + StaticAllocas.reserve(NumArgs * 2); + + auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * { + if (!V) + return nullptr; + V = V->stripPointerCasts(); + const auto *AI = dyn_cast<AllocaInst>(V); + if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI)) + return nullptr; + auto Iter = StaticAllocas.insert({AI, Unknown}); + return &Iter.first->second; + }; + + // Look for stores of arguments to static allocas. Look through bitcasts and + // GEPs to handle type coercions, as long as the alloca is fully initialized + // by the store. Any non-store use of an alloca escapes it and any subsequent + // unanalyzed store might write it. + // FIXME: Handle structs initialized with multiple stores. 
+ for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) { + // Look for stores, and handle non-store uses conservatively. + const auto *SI = dyn_cast<StoreInst>(&I); + if (!SI) { + // We will look through cast uses, so ignore them completely. + if (I.isCast()) + continue; + // Ignore debug info intrinsics, they don't escape or store to allocas. + if (isa<DbgInfoIntrinsic>(I)) + continue; + // This is an unknown instruction. Assume it escapes or writes to all + // static alloca operands. + for (const Use &U : I.operands()) { + if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U)) + *Info = StaticAllocaInfo::Clobbered; + } + continue; + } + + // If the stored value is a static alloca, mark it as escaped. + if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand())) + *Info = StaticAllocaInfo::Clobbered; + + // Check if the destination is a static alloca. + const Value *Dst = SI->getPointerOperand()->stripPointerCasts(); + StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst); + if (!Info) + continue; + const AllocaInst *AI = cast<AllocaInst>(Dst); + + // Skip allocas that have been initialized or clobbered. + if (*Info != StaticAllocaInfo::Unknown) + continue; + + // Check if the stored value is an argument, and that this store fully + // initializes the alloca. Don't elide copies from the same argument twice. + const Value *Val = SI->getValueOperand()->stripPointerCasts(); + const auto *Arg = dyn_cast<Argument>(Val); + if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() || + Arg->getType()->isEmptyTy() || + DL.getTypeStoreSize(Arg->getType()) != + DL.getTypeAllocSize(AI->getAllocatedType()) || + ArgCopyElisionCandidates.count(Arg)) { + *Info = StaticAllocaInfo::Clobbered; + continue; + } + + DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n'); + + // Mark this alloca and store for argument copy elision. + *Info = StaticAllocaInfo::Elidable; + ArgCopyElisionCandidates.insert({Arg, {AI, SI}}); + + // Stop scanning if we've seen all arguments. This will happen early in -O0 + // builds, which is useful, because -O0 builds have large entry blocks and + // many allocas. + if (ArgCopyElisionCandidates.size() == NumArgs) + break; + } +} + +/// Try to elide argument copies from memory into a local alloca. Succeeds if +/// ArgVal is a load from a suitable fixed stack object. +static void tryToElideArgumentCopy( + FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains, + DenseMap<int, int> &ArgCopyElisionFrameIndexMap, + SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs, + ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg, + SDValue ArgVal, bool &ArgHasUses) { + // Check if this is a load from a fixed stack object. + auto *LNode = dyn_cast<LoadSDNode>(ArgVal); + if (!LNode) + return; + auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()); + if (!FINode) + return; + + // Check that the fixed stack object is the right size and alignment. + // Look at the alignment that the user wrote on the alloca instead of looking + // at the stack object. 
+ auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg); + assert(ArgCopyIter != ArgCopyElisionCandidates.end()); + const AllocaInst *AI = ArgCopyIter->second.first; + int FixedIndex = FINode->getIndex(); + int &AllocaIndex = FuncInfo->StaticAllocaMap[AI]; + int OldIndex = AllocaIndex; + MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo(); + if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) { + DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack " + "object size\n"); + return; + } + unsigned RequiredAlignment = AI->getAlignment(); + if (!RequiredAlignment) { + RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment( + AI->getAllocatedType()); + } + if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) { + DEBUG(dbgs() << " argument copy elision failed: alignment of alloca " + "greater than stack argument alignment (" + << RequiredAlignment << " vs " + << MFI.getObjectAlignment(FixedIndex) << ")\n"); + return; + } + + // Perform the elision. Delete the old stack object and replace its only use + // in the variable info map. Mark the stack object as mutable. + DEBUG({ + dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n' + << " Replacing frame index " << OldIndex << " with " << FixedIndex + << '\n'; + }); + MFI.RemoveStackObject(OldIndex); + MFI.setIsImmutableObjectIndex(FixedIndex, false); + AllocaIndex = FixedIndex; + ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex}); + Chains.push_back(ArgVal.getValue(1)); + + // Avoid emitting code for the store implementing the copy. + const StoreInst *SI = ArgCopyIter->second.second; + ElidedArgCopyInstrs.insert(SI); + + // Check for uses of the argument again so that we can avoid exporting ArgVal + // if it is't used by anything other than the store. + for (const Value *U : Arg.users()) { + if (U != SI) { + ArgHasUses = true; + break; + } + } +} + void SelectionDAGISel::LowerArguments(const Function &F) { SelectionDAG &DAG = SDB->DAG; SDLoc dl = SDB->getCurSDLoc(); @@ -8032,16 +8515,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) { Ins.push_back(RetArg); } + // Look for stores of arguments to static allocas. Mark such arguments with a + // flag to ask the target to give us the memory location of that argument if + // available. + ArgCopyElisionMapTy ArgCopyElisionCandidates; + findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates); + // Set up the incoming argument description vector. 
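To make the argument-copy-elision machinery above concrete, here is an illustrative C++ function (not from the patch) that produces the pattern it targets. Under the x86-64 SysV ABI the seventh integer parameter is passed in memory; an unoptimized build still creates a local slot for it and stores the incoming argument into that slot in the entry block. findArgumentCopyElisionCandidates records exactly that store of an Argument into a static alloca, and tryToElideArgumentCopy then points the alloca at the caller's fixed stack object where the value already lives, provided the object's size and alignment match:

    extern void consume(long *);

    // 'g' arrives on the stack; at -O0 the front end emits
    //   %g.addr = alloca i64
    //   store i64 %g, i64* %g.addr
    // in the entry block, which is the elidable copy described above.
    void seven_args(long a, long b, long c, long d, long e, long f, long g) {
      consume(&g);
    }

When the size or alignment check fails, nothing is elided and the copying store is emitted as before.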
- unsigned Idx = 1; - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; ++I, ++Idx) { + for (const Argument &Arg : F.args()) { + unsigned ArgNo = Arg.getArgNo(); SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); - bool isArgValueUsed = !I->use_empty(); + ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); + bool isArgValueUsed = !Arg.use_empty(); unsigned PartBase = 0; - Type *FinalType = I->getType(); - if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) + Type *FinalType = Arg.getType(); + if (Arg.hasAttribute(Attribute::ByVal)) FinalType = cast<PointerType>(FinalType)->getElementType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( FinalType, F.getCallingConv(), F.isVarArg()); @@ -8050,17 +8538,22 @@ void SelectionDAGISel::LowerArguments(const Function &F) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) + // Certain targets (such as MIPS), may have a different ABI alignment + // for a type depending on the context. Give the target a chance to + // specify the alignment it wants. + unsigned OriginalAlignment = + TLI->getABIAlignmentForCallingConv(ArgTy, DL); + + if (Arg.hasAttribute(Attribute::ZExt)) Flags.setZExt(); - if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) + if (Arg.hasAttribute(Attribute::SExt)) Flags.setSExt(); - if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) { + if (Arg.hasAttribute(Attribute::InReg)) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (F.getCallingConv() == CallingConv::X86_VectorCall && - isa<StructType>(I->getType())) { + isa<StructType>(Arg.getType())) { // The first value of a structure is marked if (0 == Value) Flags.setHvaStart(); @@ -8069,15 +8562,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Set InReg Flag Flags.setInReg(); } - if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet)) + if (Arg.hasAttribute(Attribute::StructRet)) Flags.setSRet(); - if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf)) + if (Arg.hasAttribute(Attribute::SwiftSelf)) Flags.setSwiftSelf(); - if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftError)) + if (Arg.hasAttribute(Attribute::SwiftError)) Flags.setSwiftError(); - if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) + if (Arg.hasAttribute(Attribute::ByVal)) Flags.setByVal(); - if (F.getAttributes().hasAttribute(Idx, Attribute::InAlloca)) { + if (Arg.hasAttribute(Attribute::InAlloca)) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated @@ -8088,33 +8581,37 @@ void SelectionDAGISel::LowerArguments(const Function &F) { } if (F.getCallingConv() == CallingConv::X86_INTR) { // IA Interrupt passes frame (1st parameter) by value in the stack. - if (Idx == 1) + if (ArgNo == 0) Flags.setByVal(); } if (Flags.isByVal() || Flags.isInAlloca()) { - PointerType *Ty = cast<PointerType>(I->getType()); + PointerType *Ty = cast<PointerType>(Arg.getType()); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. 
unsigned FrameAlign; - if (F.getParamAlignment(Idx)) - FrameAlign = F.getParamAlignment(Idx); + if (Arg.getParamAlignment()) + FrameAlign = Arg.getParamAlignment(); else FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL); Flags.setByValAlign(FrameAlign); } - if (F.getAttributes().hasAttribute(Idx, Attribute::Nest)) + if (Arg.hasAttribute(Attribute::Nest)) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); + if (ArgCopyElisionCandidates.count(&Arg)) + Flags.setCopyElisionCandidate(); - MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT); - unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT); + MVT RegisterVT = + TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT); + unsigned NumRegs = + TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed, - Idx-1, PartBase+i*RegisterVT.getStoreSize()); + ArgNo, PartBase+i*RegisterVT.getStoreSize()); if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); // if it isn't first piece, alignment must be 1 @@ -8155,7 +8652,6 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Set up the argument values. unsigned i = 0; - Idx = 1; if (!FuncInfo->CanLowerReturn) { // Create a virtual register for the sret pointer, and put in a copy // from the sret argument into it. @@ -8177,49 +8673,63 @@ void SelectionDAGISel::LowerArguments(const Function &F) { DAG.setRoot(NewRoot); // i indexes lowered arguments. Bump it past the hidden sret argument. - // Idx indexes LLVM arguments. Don't touch it. ++i; } - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; - ++I, ++Idx) { + SmallVector<SDValue, 4> Chains; + DenseMap<int, int> ArgCopyElisionFrameIndexMap; + for (const Argument &Arg : F.args()) { SmallVector<SDValue, 4> ArgValues; SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); + ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); + if (NumValues == 0) + continue; + + bool ArgHasUses = !Arg.use_empty(); + + // Elide the copying store if the target loaded this argument from a + // suitable fixed stack object. + if (Ins[i].Flags.isCopyElisionCandidate()) { + tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap, + ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg, + InVals[i], ArgHasUses); + } // If this argument is unused then remember its value. It is used to generate // debugging information. bool isSwiftErrorArg = TLI->supportSwiftError() && - F.getAttributes().hasAttribute(Idx, Attribute::SwiftError); - if (I->use_empty() && NumValues && !isSwiftErrorArg) { - SDB->setUnusedArgValue(&*I, InVals[i]); + Arg.hasAttribute(Attribute::SwiftError); + if (!ArgHasUses && !isSwiftErrorArg) { + SDB->setUnusedArgValue(&Arg, InVals[i]); // Also remember any frame index for use in FastISel. 
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(InVals[i].getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } for (unsigned Val = 0; Val != NumValues; ++Val) { EVT VT = ValueVTs[Val]; - MVT PartVT = TLI->getRegisterType(*CurDAG->getContext(), VT); - unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT); + MVT PartVT = + TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT); + unsigned NumParts = + TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT); // Even an apparant 'unused' swifterror argument needs to be returned. So // we do generate a copy for it that can be used on return from the // function. - if (!I->use_empty() || isSwiftErrorArg) { + if (ArgHasUses || isSwiftErrorArg) { Optional<ISD::NodeType> AssertOp; - if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) + if (Arg.hasAttribute(Attribute::SExt)) AssertOp = ISD::AssertSext; - else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) + else if (Arg.hasAttribute(Attribute::ZExt)) AssertOp = ISD::AssertZext; - ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], - NumParts, PartVT, VT, - nullptr, AssertOp)); + ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, + PartVT, VT, nullptr, AssertOp, + true)); } i += NumParts; @@ -8232,18 +8742,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); - SDB->setValue(&*I, Res); + SDB->setValue(&Arg, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(Res.getOperand(0).getNode())) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } // Update the SwiftErrorVRegDefMap. @@ -8263,18 +8773,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // uses with vregs. unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { - FuncInfo->ValueMap[&*I] = Reg; + FuncInfo->ValueMap[&Arg] = Reg; continue; } } - if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) { - FuncInfo->InitializeRegForValue(&*I); - SDB->CopyToExportRegsIfNeeded(&*I); + if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) { + FuncInfo->InitializeRegForValue(&Arg); + SDB->CopyToExportRegsIfNeeded(&Arg); } } + if (!Chains.empty()) { + Chains.push_back(NewRoot); + NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + } + + DAG.setRoot(NewRoot); + assert(i == InVals.size() && "Argument register count mismatch!"); + // If any argument copy elisions occurred and we have debug info, update the + // stale frame indices used in the dbg.declare variable info table. 
+ MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo(); + if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) { + for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) { + auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot); + if (I != ArgCopyElisionFrameIndexMap.end()) + VI.Slot = I->second; + } + } + // Finally, if the target has anything special to do, allow it to do so. EmitFunctionEntryCode(); } @@ -8402,13 +8930,10 @@ void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) { HasTailCall = true; } -bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters, - const SmallVectorImpl<unsigned> &TotalCases, - unsigned First, unsigned Last, - unsigned Density) const { +uint64_t +SelectionDAGBuilder::getJumpTableRange(const CaseClusterVector &Clusters, + unsigned First, unsigned Last) const { assert(Last >= First); - assert(TotalCases[Last] >= TotalCases[First]); - const APInt &LowCase = Clusters[First].Low->getValue(); const APInt &HighCase = Clusters[Last].High->getValue(); assert(LowCase.getBitWidth() == HighCase.getBitWidth()); @@ -8417,26 +8942,17 @@ bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters, // comparison to lower. We should discriminate against such consecutive ranges // in jump tables. - uint64_t Diff = (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100); - uint64_t Range = Diff + 1; + return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1; +} +uint64_t SelectionDAGBuilder::getJumpTableNumCases( + const SmallVectorImpl<unsigned> &TotalCases, unsigned First, + unsigned Last) const { + assert(Last >= First); + assert(TotalCases[Last] >= TotalCases[First]); uint64_t NumCases = TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]); - - assert(NumCases < UINT64_MAX / 100); - assert(Range >= NumCases); - - return NumCases * 100 >= Range * Density; -} - -static inline bool areJTsAllowed(const TargetLowering &TLI, - const SwitchInst *SI) { - const Function *Fn = SI->getParent()->getParent(); - if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true") - return false; - - return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || - TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other); + return NumCases; } bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, @@ -8475,10 +8991,11 @@ bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumDests = JTProbs.size(); - if (isSuitableForBitTests(NumDests, NumCmps, - Clusters[First].Low->getValue(), - Clusters[Last].High->getValue())) { + if (TLI.isSuitableForBitTests( + NumDests, NumCmps, Clusters[First].Low->getValue(), + Clusters[Last].High->getValue(), DAG.getDataLayout())) { // Clusters[First..Last] should be lowered as bit tests instead. 
return false; } @@ -8499,7 +9016,6 @@ bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, } JumpTableMBB->normalizeSuccProbs(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding()) ->createJumpTableIndex(Table); @@ -8528,17 +9044,12 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters, #endif const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!areJTsAllowed(TLI, SI)) + if (!TLI.areJTsAllowed(SI->getParent()->getParent())) return; - const bool OptForSize = DefaultMBB->getParent()->getFunction()->optForSize(); - const int64_t N = Clusters.size(); const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries(); const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2; - const unsigned MaxJumpTableSize = - OptForSize || TLI.getMaximumJumpTableSize() == 0 - ? UINT_MAX : TLI.getMaximumJumpTableSize(); if (N < 2 || N < MinJumpTableEntries) return; @@ -8553,15 +9064,12 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters, TotalCases[i] += TotalCases[i - 1]; } - const unsigned MinDensity = - OptForSize ? OptsizeJumpTableDensity : JumpTableDensity; - // Cheap case: the whole range may be suitable for jump table. - unsigned JumpTableSize = (Clusters[N - 1].High->getValue() - - Clusters[0].Low->getValue()) - .getLimitedValue(UINT_MAX - 1) + 1; - if (JumpTableSize <= MaxJumpTableSize && - isDense(Clusters, TotalCases, 0, N - 1, MinDensity)) { + uint64_t Range = getJumpTableRange(Clusters,0, N - 1); + uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1); + assert(NumCases < UINT64_MAX / 100); + assert(Range >= NumCases); + if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) { CaseCluster JTCluster; if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) { Clusters[0] = JTCluster; @@ -8614,11 +9122,11 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters, // Search for a solution that results in fewer partitions. for (int64_t j = N - 1; j > i; j--) { // Try building a partition from Clusters[i..j]. - JumpTableSize = (Clusters[j].High->getValue() - - Clusters[i].Low->getValue()) - .getLimitedValue(UINT_MAX - 1) + 1; - if (JumpTableSize <= MaxJumpTableSize && - isDense(Clusters, TotalCases, i, j, MinDensity)) { + uint64_t Range = getJumpTableRange(Clusters, i, j); + uint64_t NumCases = getJumpTableNumCases(TotalCases, i, j); + assert(NumCases < UINT64_MAX / 100); + assert(Range >= NumCases); + if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) { unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]); unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1]; int64_t NumEntries = j - i + 1; @@ -8662,36 +9170,6 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters, Clusters.resize(DstIndex); } -bool SelectionDAGBuilder::rangeFitsInWord(const APInt &Low, const APInt &High) { - // FIXME: Using the pointer type doesn't seem ideal. - uint64_t BW = DAG.getDataLayout().getPointerSizeInBits(); - uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1; - return Range <= BW; -} - -bool SelectionDAGBuilder::isSuitableForBitTests(unsigned NumDests, - unsigned NumCmps, - const APInt &Low, - const APInt &High) { - // FIXME: I don't think NumCmps is the correct metric: a single case and a - // range of cases both require only one branch to lower. Just looking at the - // number of clusters and destinations should be enough to decide whether to - // build bit tests. 
- - // To lower a range with bit tests, the range must fit the bitwidth of a - // machine word. - if (!rangeFitsInWord(Low, High)) - return false; - - // Decide whether it's profitable to lower this range with bit tests. Each - // destination requires a bit test and branch, and there is an overall range - // check branch. For a small number of clusters, separate comparisons might be - // cheaper, and for many destinations, splitting the range might be better. - return (NumDests == 1 && NumCmps >= 3) || - (NumDests == 2 && NumCmps >= 5) || - (NumDests == 3 && NumCmps >= 6); -} - bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last, const SwitchInst *SI, @@ -8713,16 +9191,17 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, APInt High = Clusters[Last].High->getValue(); assert(Low.slt(High)); - if (!isSuitableForBitTests(NumDests, NumCmps, Low, High)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = DAG.getDataLayout(); + if (!TLI.isSuitableForBitTests(NumDests, NumCmps, Low, High, DL)) return false; APInt LowBound; APInt CmpRange; - const int BitWidth = DAG.getTargetLoweringInfo() - .getPointerTy(DAG.getDataLayout()) - .getSizeInBits(); - assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!"); + const int BitWidth = TLI.getPointerTy(DL).getSizeInBits(); + assert(TLI.rangeFitsInWord(Low, High, DL) && + "Case range must fit in bit mask!"); // Check if the clusters cover a contiguous range such that no value in the // range will jump to the default statement. @@ -8812,7 +9291,9 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters, // If target does not have legal shift left, do not emit bit tests at all. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT PTy = TLI.getPointerTy(DAG.getDataLayout()); + const DataLayout &DL = DAG.getDataLayout(); + + EVT PTy = TLI.getPointerTy(DL); if (!TLI.isOperationLegal(ISD::SHL, PTy)) return; @@ -8843,8 +9324,8 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters, // Try building a partition from Clusters[i..j]. // Check the range. - if (!rangeFitsInWord(Clusters[i].Low->getValue(), - Clusters[j].High->getValue())) + if (!TLI.rangeFitsInWord(Clusters[i].Low->getValue(), + Clusters[j].High->getValue(), DL)) continue; // Check nbr of destinations and cluster types. diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index abde8a8..ac1d6aa 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -38,7 +38,6 @@ class BranchInst; class CallInst; class DbgValueInst; class ExtractElementInst; -class ExtractValueInst; class FCmpInst; class FPExtInst; class FPToSIInst; @@ -53,7 +52,6 @@ class IntToPtrInst; class IndirectBrInst; class InvokeInst; class InsertElementInst; -class InsertValueInst; class Instruction; class LoadInst; class MachineBasicBlock; @@ -304,10 +302,13 @@ private: BranchProbability DefaultProb; }; - /// Check whether a range of clusters is dense enough for a jump table. - bool isDense(const CaseClusterVector &Clusters, - const SmallVectorImpl<unsigned> &TotalCases, - unsigned First, unsigned Last, unsigned MinDensity) const; + /// Return the range of value in [First..Last]. 
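For the jump-table rework above: the density comparison itself does not change, it moves behind TargetLowering::isSuitableForJumpTable and is fed by the two new helpers whose declarations follow below. A stand-alone sketch of the check the removed isDense() performed (and which the target hook is expected to reproduce), assuming a percentage-based minimum density:

    #include <cstdint>

    // True when the cases cover at least MinDensityPercent of the value range
    // [low..high]; Range and NumCases correspond to getJumpTableRange and
    // getJumpTableNumCases for the cluster slice being considered.
    bool denseEnoughForJumpTable(std::uint64_t NumCases, std::uint64_t Range,
                                 unsigned MinDensityPercent) {
      return NumCases * 100 >= Range * MinDensityPercent;
    }

For example, case values {0, 1, 2, 100} give Range = 101 and NumCases = 4, roughly 4% density, sparse enough that a single jump table over the whole range would be rejected and the partitioning loop would look for smaller, denser slices instead.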
+ uint64_t getJumpTableRange(const CaseClusterVector &Clusters, unsigned First, + unsigned Last) const; + + /// Return the number of cases in [First..Last]. + uint64_t getJumpTableNumCases(const SmallVectorImpl<unsigned> &TotalCases, + unsigned First, unsigned Last) const; /// Build a jump table cluster from Clusters[First..Last]. Returns false if it /// decides it's not a good idea. @@ -319,14 +320,6 @@ private: void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI, MachineBasicBlock *DefaultMBB); - /// Check whether the range [Low,High] fits in a machine word. - bool rangeFitsInWord(const APInt &Low, const APInt &High); - - /// Check whether these clusters are suitable for lowering with bit tests based - /// on the number of destinations, comparison metric, and range. - bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, - const APInt &Low, const APInt &High); - /// Build a bit test cluster from Clusters[First..Last]. Returns false if it /// decides it's not a good idea. bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last, @@ -609,40 +602,34 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), - DAG(dag), FuncInfo(funcinfo), + DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo), HasTailCall(false) { } - void init(GCFunctionInfo *gfi, AliasAnalysis &aa, + void init(GCFunctionInfo *gfi, AliasAnalysis *AA, const TargetLibraryInfo *li); - /// clear - Clear out the current SelectionDAG and the associated - /// state and prepare this SelectionDAGBuilder object to be used - /// for a new block. This doesn't clear out information about - /// additional blocks that are needed to complete switch lowering - /// or PHI node updating; that information is cleared out as it is - /// consumed. + /// Clear out the current SelectionDAG and the associated state and prepare + /// this SelectionDAGBuilder object to be used for a new block. This doesn't + /// clear out information about additional blocks that are needed to complete + /// switch lowering or PHI node updating; that information is cleared out as + /// it is consumed. void clear(); - /// clearDanglingDebugInfo - Clear the dangling debug information - /// map. This function is separated from the clear so that debug - /// information that is dangling in a basic block can be properly - /// resolved in a different basic block. This allows the - /// SelectionDAG to resolve dangling debug information attached - /// to PHI nodes. + /// Clear the dangling debug information map. This function is separated from + /// the clear so that debug information that is dangling in a basic block can + /// be properly resolved in a different basic block. This allows the + /// SelectionDAG to resolve dangling debug information attached to PHI nodes. void clearDanglingDebugInfo(); - /// getRoot - Return the current virtual root of the Selection DAG, - /// flushing any PendingLoad items. This must be done before emitting - /// a store or any other node that may need to be ordered after any - /// prior load instructions. - /// + /// Return the current virtual root of the Selection DAG, flushing any + /// PendingLoad items. This must be done before emitting a store or any other + /// node that may need to be ordered after any prior load instructions. 
SDValue getRoot(); - /// getControlRoot - Similar to getRoot, but instead of flushing all the - /// PendingLoad items, flush all the PendingExports items. It is necessary - /// to do this before emitting a terminator instruction. - /// + /// Similar to getRoot, but instead of flushing all the PendingLoad items, + /// flush all the PendingExports items. It is necessary to do this before + /// emitting a terminator instruction. SDValue getControlRoot(); SDLoc getCurSDLoc() const { @@ -688,12 +675,13 @@ public: MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TW, - BranchProbability FW); + BranchProbability FW, bool InvertCond); void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - BranchProbability TW, BranchProbability FW); + BranchProbability TW, BranchProbability FW, + bool InvertCond); bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases); bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB); void CopyToExportRegsIfNeeded(const Value *V); @@ -782,6 +770,11 @@ public: bool VarArgDisallowed, bool ForceVoidReturnTy); + /// Returns the type of FrameIndex and TargetFrameIndex nodes. + MVT getFrameIndexTy() { + return DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()); + } + private: // Terminator instructions. void visitRet(const ReturnInst &I); @@ -864,8 +857,8 @@ private: void visitInsertElement(const User &I); void visitShuffleVector(const User &I); - void visitExtractValue(const ExtractValueInst &I); - void visitInsertValue(const InsertValueInst &I); + void visitExtractValue(const User &I); + void visitInsertValue(const User &I); void visitLandingPad(const LandingPadInst &I); void visitGetElementPtr(const User &I); @@ -900,6 +893,7 @@ private: void visitInlineAsm(ImmutableCallSite CS); const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); + void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVAStart(const CallInst &I); void visitVAArg(const VAArgInst &I); @@ -913,6 +907,8 @@ private: void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const GCResultInst &I); + void visitVectorReduce(const CallInst &I, unsigned Intrinsic); + void visitUserOp1(const Instruction &I) { llvm_unreachable("UserOp1 should not exist at instruction selection time!"); } @@ -932,7 +928,7 @@ private: /// instruction selection, they will be inserted to the entry BB. bool EmitFuncArgumentDbgValue(const Value *V, DILocalVariable *Variable, DIExpression *Expr, DILocation *DL, - int64_t Offset, bool IsIndirect, + int64_t Offset, bool IsDbgDeclare, const SDValue &N); /// Return the next block after MBB, or nullptr if there is none. @@ -944,8 +940,8 @@ private: /// Return the appropriate SDDbgValue based on N. SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable, - DIExpression *Expr, int64_t Offset, DebugLoc dl, - unsigned DbgSDNodeOrder); + DIExpression *Expr, int64_t Offset, + const DebugLoc &dl, unsigned DbgSDNodeOrder); }; /// RegsForValue - This struct represents the registers (physical or virtual) @@ -958,62 +954,69 @@ private: /// type. 
/// struct RegsForValue { - /// ValueVTs - The value types of the values, which may not be legal, and + /// The value types of the values, which may not be legal, and /// may need be promoted or synthesized from one or more registers. - /// SmallVector<EVT, 4> ValueVTs; - /// RegVTs - The value types of the registers. This is the same size as - /// ValueVTs and it records, for each value, what the type of the assigned - /// register or registers are. (Individual values are never synthesized - /// from more than one type of register.) + /// The value types of the registers. This is the same size as ValueVTs and it + /// records, for each value, what the type of the assigned register or + /// registers are. (Individual values are never synthesized from more than one + /// type of register.) /// /// With virtual registers, the contents of RegVTs is redundant with TLI's /// getRegisterType member function, however when with physical registers /// it is necessary to have a separate record of the types. - /// SmallVector<MVT, 4> RegVTs; - /// Regs - This list holds the registers assigned to the values. + /// This list holds the registers assigned to the values. /// Each legal or promoted value requires one register, and each /// expanded value requires multiple registers. - /// SmallVector<unsigned, 4> Regs; + /// This list holds the number of registers for each value. + SmallVector<unsigned, 4> RegCount; + + /// Records if this value needs to be treated in an ABI dependant manner, + /// different to normal type legalization. + bool IsABIMangled; + RegsForValue(); - RegsForValue(const SmallVector<unsigned, 4> ®s, MVT regvt, EVT valuevt); + RegsForValue(const SmallVector<unsigned, 4> ®s, MVT regvt, EVT valuevt, + bool IsABIMangledValue = false); RegsForValue(LLVMContext &Context, const TargetLowering &TLI, - const DataLayout &DL, unsigned Reg, Type *Ty); + const DataLayout &DL, unsigned Reg, Type *Ty, + bool IsABIMangledValue = false); - /// append - Add the specified values to this one. + /// Add the specified values to this one. void append(const RegsForValue &RHS) { ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end()); RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end()); Regs.append(RHS.Regs.begin(), RHS.Regs.end()); + RegCount.push_back(RHS.Regs.size()); } - /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from - /// this value and returns the result as a ValueVTs value. This uses - /// Chain/Flag as the input and updates them for the output Chain/Flag. - /// If the Flag pointer is NULL, no flag is used. + /// Emit a series of CopyFromReg nodes that copies from this value and returns + /// the result as a ValueVTs value. This uses Chain/Flag as the input and + /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no + /// flag is used. SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr) const; - /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified - /// value into the registers specified by this object. This uses Chain/Flag - /// as the input and updates them for the output Chain/Flag. If the Flag - /// pointer is nullptr, no flag is used. If V is not nullptr, then it is used - /// in printing better diagnostic messages on error. + /// Emit a series of CopyToReg nodes that copies the specified value into the + /// registers specified by this object. 
This uses Chain/Flag as the input and + /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no + /// flag is used. If V is not nullptr, then it is used in printing better + /// diagnostic messages on error. void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr, ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; - /// AddInlineAsmOperands - Add this value to the specified inlineasm node - /// operand list. This adds the code marker, matching input operand index - /// (if applicable), and includes the number of values added into it. + /// Add this value to the specified inlineasm node operand list. This adds the + /// code marker, matching input operand index (if applicable), and includes + /// the number of values added into it. void AddInlineAsmOperands(unsigned Kind, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, std::vector<SDValue> &Ops) const; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0faaad8..3dd5897 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" @@ -214,6 +214,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; case ISD::SETCCE: return "setcce"; + case ISD::SETCCCARRY: return "setcccarry"; case ISD::SELECT: return "select"; case ISD::VSELECT: return "vselect"; case ISD::SELECT_CC: return "select_cc"; @@ -227,6 +228,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CARRY_FALSE: return "carry_false"; case ISD::ADDC: return "addc"; case ISD::ADDE: return "adde"; + case ISD::ADDCARRY: return "addcarry"; case ISD::SADDO: return "saddo"; case ISD::UADDO: return "uaddo"; case ISD::SSUBO: return "ssubo"; @@ -235,6 +237,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UMULO: return "umulo"; case ISD::SUBC: return "subc"; case ISD::SUBE: return "sube"; + case ISD::SUBCARRY: return "subcarry"; case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; case ISD::SRL_PARTS: return "srl_parts"; @@ -300,6 +303,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation + case ISD::ABS: return "abs"; case ISD::BITREVERSE: return "bitreverse"; case ISD::BSWAP: return "bswap"; case ISD::CTPOP: return "ctpop"; @@ -343,6 +347,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETFALSE: return "setfalse"; case ISD::SETFALSE2: return "setfalse2"; } + case ISD::VECREDUCE_FADD: return "vecreduce_fadd"; + case ISD::VECREDUCE_FMUL: return "vecreduce_fmul"; + case ISD::VECREDUCE_ADD: return "vecreduce_add"; + case ISD::VECREDUCE_MUL: return "vecreduce_mul"; + case ISD::VECREDUCE_AND: return "vecreduce_and"; + case 
ISD::VECREDUCE_OR: return "vecreduce_or"; + case ISD::VECREDUCE_XOR: return "vecreduce_xor"; + case ISD::VECREDUCE_SMAX: return "vecreduce_smax"; + case ISD::VECREDUCE_SMIN: return "vecreduce_smin"; + case ISD::VECREDUCE_UMAX: return "vecreduce_umax"; + case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; + case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; + case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; } } @@ -366,11 +383,13 @@ static Printable PrintNodeId(const SDNode &Node) { }); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); } -void SDNode::dump(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); dbgs() << '\n'; } +#endif void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { for (unsigned i = 0, e = getNumValues(); i != e; ++i) { @@ -416,7 +435,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { OS << '<' << CSDN->getValueAPF().convertToDouble() << '>'; else { OS << "<APFloat("; - CSDN->getValueAPF().bitcastToAPInt().dump(); + CSDN->getValueAPF().bitcastToAPInt().print(OS, false); OS << ")>"; } } else if (const GlobalAddressSDNode *GADN = @@ -566,6 +585,7 @@ static bool shouldPrintInline(const SDNode &Node) { return Node.getNumOperands() == 0; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { for (const SDValue &Op : N->op_values()) { if (shouldPrintInline(*Op.getNode())) @@ -592,6 +612,7 @@ LLVM_DUMP_METHOD void SelectionDAG::dump() const { if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this); dbgs() << "\n\n"; } +#endif void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { OS << PrintNodeId(*this) << ": "; @@ -618,6 +639,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G, } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet; static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, const SelectionDAG *G, VisitedSDNodeSet &once) { @@ -646,15 +668,16 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, DumpNodesr(OS, Op.getNode(), indent+2, G, once); } -void SDNode::dumpr() const { +LLVM_DUMP_METHOD void SDNode::dumpr() const { VisitedSDNodeSet once; DumpNodesr(dbgs(), this, 0, nullptr, once); } -void SDNode::dumpr(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const { VisitedSDNodeSet once; DumpNodesr(dbgs(), this, 0, G, once); } +#endif static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N, const SelectionDAG *G, unsigned depth, @@ -688,14 +711,17 @@ void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const { printrWithDepth(OS, G, 10); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const { printrWithDepth(dbgs(), G, depth); } -void SDNode::dumprFull(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const { // Don't print impossibly deep things. 
dumprWithDepth(G, 10); } +#endif void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { printr(OS, G); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 64e6c22..bdf57e8 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1,4 +1,4 @@ -//===-- SelectionDAGISel.cpp - Implement the SelectionDAGISel class -------===// +//===- SelectionDAGISel.cpp - Implement the SelectionDAGISel class --------===// // // The LLVM Compiler Infrastructure // @@ -11,43 +11,73 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" -#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/StackProtector.h" -#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -59,6 +89,14 @@ #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <memory> +#include <string> +#include <utility> +#include <vector> using namespace llvm; @@ -73,104 +111,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered"); STATISTIC(NumFastIselFailLowerArguments, "Number of entry blocks where fast isel failed to lower arguments"); -#ifndef NDEBUG -static cl::opt<bool> -EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden, - cl::desc("Enable extra verbose messages in the \"fast\" " - "instruction selector")); - - // Terminators -STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret"); -STATISTIC(NumFastIselFailBr,"Fast isel fails on Br"); -STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch"); -STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr"); -STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke"); -STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume"); -STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable"); - - // Standard binary operators... -STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add"); -STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd"); -STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub"); -STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub"); -STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul"); -STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul"); -STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv"); -STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv"); -STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv"); -STATISTIC(NumFastIselFailURem,"Fast isel fails on URem"); -STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem"); -STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem"); - - // Logical operators... -STATISTIC(NumFastIselFailAnd,"Fast isel fails on And"); -STATISTIC(NumFastIselFailOr,"Fast isel fails on Or"); -STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor"); - - // Memory instructions... -STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca"); -STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load"); -STATISTIC(NumFastIselFailStore,"Fast isel fails on Store"); -STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg"); -STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM"); -STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence"); -STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr"); - - // Convert instructions... 
-STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc"); -STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt"); -STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt"); -STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc"); -STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt"); -STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI"); -STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI"); -STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP"); -STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP"); -STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr"); -STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt"); -STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast"); - - // Other instructions... -STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp"); -STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp"); -STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI"); -STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select"); -STATISTIC(NumFastIselFailCall,"Fast isel fails on Call"); -STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl"); -STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr"); -STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr"); -STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg"); -STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement"); -STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement"); -STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector"); -STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue"); -STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue"); -STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad"); - -// Intrinsic instructions... 
-STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call"); -STATISTIC(NumFastIselFailSAddWithOverflow, - "Fast isel fails on sadd.with.overflow"); -STATISTIC(NumFastIselFailUAddWithOverflow, - "Fast isel fails on uadd.with.overflow"); -STATISTIC(NumFastIselFailSSubWithOverflow, - "Fast isel fails on ssub.with.overflow"); -STATISTIC(NumFastIselFailUSubWithOverflow, - "Fast isel fails on usub.with.overflow"); -STATISTIC(NumFastIselFailSMulWithOverflow, - "Fast isel fails on smul.with.overflow"); -STATISTIC(NumFastIselFailUMulWithOverflow, - "Fast isel fails on umul.with.overflow"); -STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress"); -STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call"); -STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call"); -STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call"); -#endif - -static cl::opt<bool> -EnableFastISelVerbose("fast-isel-verbose", cl::Hidden, - cl::desc("Enable verbose messages in the \"fast\" " - "instruction selector")); static cl::opt<int> EnableFastISelAbort( "fast-isel-abort", cl::Hidden, cl::desc("Enable abort calls when \"fast\" instruction selection " @@ -179,6 +119,11 @@ static cl::opt<int> EnableFastISelAbort( "abort for argument lowering, and 3 will never fallback " "to SelectionDAG.")); +static cl::opt<bool> EnableFastISelFallbackReport( + "fast-isel-report-on-fallback", cl::Hidden, + cl::desc("Emit a diagnostic when \"fast\" instruction selection " + "falls back to SelectionDAG.")); + static cl::opt<bool> UseMBPI("use-mbpi", cl::desc("use Machine Branch Probability Info"), @@ -238,7 +183,7 @@ MachinePassRegistry RegisterScheduler::Registry; /// //===---------------------------------------------------------------------===// static cl::opt<RegisterScheduler::FunctionPassCtor, false, - RegisterPassParser<RegisterScheduler> > + RegisterPassParser<RegisterScheduler>> ISHeuristic("pre-RA-sched", cl::init(&createDefaultScheduler), cl::Hidden, cl::desc("Instruction schedulers available (before register" @@ -249,6 +194,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target", createDefaultScheduler); namespace llvm { + //===--------------------------------------------------------------------===// /// \brief This class is used by SelectionDAGISel to temporarily override /// the optimization level on a per-function basis. @@ -318,6 +264,7 @@ namespace llvm { "Unknown sched type!"); return createILPListDAGScheduler(IS, OptLevel); } + } // end namespace llvm // EmitInstrWithCustomInserter - This method should be implemented by targets @@ -357,7 +304,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, FuncInfo(new FunctionLoweringInfo()), CurDAG(new SelectionDAG(tm, OL)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), - GFI(), + AA(), GFI(), OptLevel(OL), DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); @@ -375,7 +322,8 @@ SelectionDAGISel::~SelectionDAGISel() { } void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AAResultsWrapperPass>(); + if (OptLevel != CodeGenOpt::None) + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); AU.addRequired<StackProtector>(); AU.addPreserved<StackProtector>(); @@ -389,11 +337,13 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { /// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that /// may trap on it. 
In this case we have to split the edge so that the path /// through the predecessor block that doesn't go to the phi block doesn't -/// execute the possibly trapping instruction. -/// +/// execute the possibly trapping instruction. If available, we pass domtree +/// and loop info to be updated when we split critical edges. This is because +/// SelectionDAGISel preserves these analyses. /// This is required for correctness, so it must be done at -O0. /// -static void SplitCriticalSideEffectEdges(Function &Fn) { +static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT, + LoopInfo *LI) { // Loop for blocks with phi nodes. for (BasicBlock &BB : Fn) { PHINode *PN = dyn_cast<PHINode>(BB.begin()); @@ -419,7 +369,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn) { // Okay, we have to split this edge. SplitCriticalEdge( Pred->getTerminator(), GetSuccessorNumber(Pred, &BB), - CriticalEdgeSplittingOptions().setMergeIdenticalEdges()); + CriticalEdgeSplittingOptions(DT, LI).setMergeIdenticalEdges()); goto ReprocessBlock; } } @@ -431,8 +381,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MachineFunctionProperties::Property::Selected)) return false; // Do some sanity-checking on the command-line options. - assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) && - "-fast-isel-verbose requires -fast-isel"); assert((!EnableFastISelAbort || TM.Options.EnableFastISel) && "-fast-isel-abort > 0 requires -fast-isel"); @@ -454,23 +402,37 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; + ORE = make_unique<OptimizationRemarkEmitter>(&Fn); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); - SplitCriticalSideEffectEdges(const_cast<Function &>(Fn)); + SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI); - CurDAG->init(*MF); + CurDAG->init(*MF, *ORE); FuncInfo->set(Fn, *MF, CurDAG); + // Now get the optional analyzes if we want to. + // This is based on the possibly changed OptLevel (after optnone is taken + // into account). That's unfortunate but OK because it just means we won't + // ask for passes that have been required anyway. 
+ if (UseMBPI && OptLevel != CodeGenOpt::None) FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); else FuncInfo->BPI = nullptr; - SDB->init(GFI, *AA, LibInfo); + if (OptLevel != CodeGenOpt::None) + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + else + AA = nullptr; + + SDB->init(GFI, AA, LibInfo); MF->setHasInlineAsm(false); @@ -502,6 +464,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TLI->initializeSplitCSR(EntryMBB); SelectAllBasicBlocks(Fn); + if (FastISelFailed && EnableFastISelFallbackReport) { + DiagnosticInfoISelFallback DiagFallback(Fn); + Fn.getContext().diagnose(DiagFallback); + } // If the first basic block in the function has live ins that need to be // copied into vregs, emit the copies into the top of the block before @@ -628,7 +594,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { unsigned To = I->second; // If To is also scheduled to be replaced, find what its ultimate // replacement is. - for (;;) { + while (true) { DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To); if (J == E) break; To = J->second; @@ -648,13 +614,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MRI.replaceRegWith(From, To); } - if (TLI->hasCopyImplyingStackAdjustment(MF)) - MFI.setHasCopyImplyingStackAdjustment(true); - - // Freeze the set of reserved registers now that MachineFrameInfo has been - // set up. All the information required by getReservedRegs() should be - // available now. - MRI.freezeReservedRegs(*MF); + TLI->finalizeLowering(*MF); // Release function-specific state. SDB and CurDAG are already cleared // at this point. @@ -666,13 +626,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { return true; } +static void reportFastISelFailure(MachineFunction &MF, + OptimizationRemarkEmitter &ORE, + OptimizationRemarkMissed &R, + bool ShouldAbort) { + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. + if (!R.getLocation().isValid() || ShouldAbort) + R << (" (in function: " + MF.getName() + ")").str(); + + if (ShouldAbort) + report_fatal_error(R.getMsg()); + + ORE.emit(R); +} + void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, BasicBlock::const_iterator End, bool &HadTailCall) { // Lower the instructions. If a call is emitted as a tail call, cease emitting // nodes for this block. - for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) - SDB->visit(*I); + for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) { + if (!ElidedArgCopyInstrs.count(&*I)) + SDB->visit(*I); + } // Make sure the root of the DAG is up-to-date. 
CurDAG->setRoot(SDB->getControlRoot()); @@ -689,8 +666,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { Worklist.push_back(CurDAG->getRoot().getNode()); - APInt KnownZero; - APInt KnownOne; + KnownBits Known; do { SDNode *N = Worklist.pop_back_val(); @@ -719,8 +695,8 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { continue; unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src); - CurDAG->computeKnownBits(Src, KnownZero, KnownOne); - FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne); + CurDAG->computeKnownBits(Src, Known); + FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, Known); } while (!Worklist.empty()); } @@ -731,6 +707,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { int BlockNumber = -1; (void)BlockNumber; bool MatchFilterBB = false; (void)MatchFilterBB; + + // Pre-type legalization allow creation of any node types. + CurDAG->NewNodesMustHaveLegalTypes = false; + #ifndef NDEBUG MatchFilterBB = (FilterDAGBasicBlockName.empty() || FilterDAGBasicBlockName == @@ -756,7 +736,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine1", "DAG Combining 1", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel); + CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber @@ -777,6 +757,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); + // Only allow creation of legal node types. CurDAG->NewNodesMustHaveLegalTypes = true; if (Changed) { @@ -787,12 +768,11 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lt", "DAG Combining after legalize types", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); - } { @@ -802,12 +782,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { } if (Changed) { + DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber + << " '" << BlockName << "'\n"; CurDAG->dump()); + { NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->LegalizeTypes(); } + DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber + << " '" << BlockName << "'\n"; CurDAG->dump()); + if (ViewDAGCombineLT && MatchFilterBB) CurDAG->viewGraph("dag-combine-lv input for " + BlockName); @@ -815,7 +801,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel); } DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#" @@ -841,7 +827,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine2", "DAG Combining 2", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel); } DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber @@ -907,10 +893,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { } namespace { + /// ISelUpdater - helper class to handle updates of the instruction selection /// graph. 
class ISelUpdater : public SelectionDAG::DAGUpdateListener { SelectionDAG::allnodes_iterator &ISelPosition; + public: ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp) : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {} @@ -923,6 +911,7 @@ public: ++ISelPosition; } }; + } // end anonymous namespace void SelectionDAGISel::DoInstructionSelection() { @@ -960,6 +949,19 @@ void SelectionDAGISel::DoInstructionSelection() { if (Node->use_empty()) continue; + // When we are using non-default rounding modes or FP exception behavior + // FP operations are represented by StrictFP pseudo-operations. They + // need to be simplified here so that the target-specific instruction + // selectors know how to handle them. + // + // If the current node is a strict FP pseudo-op, the isStrictFPOp() + // function will provide the corresponding normal FP opcode to which the + // node should be mutated. + // + // FIXME: The backends need a way to handle FP constraints. + if (Node->isStrictFPOpcode()) + Node = CurDAG->mutateStrictFPToFP(Node); + Select(Node); } @@ -1046,116 +1048,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I, !FuncInfo->isExportedInst(I); // Exported instrs must be computed. } -#ifndef NDEBUG -// Collect per Instruction statistics for fast-isel misses. Only those -// instructions that cause the bail are accounted for. It does not account for -// instructions higher in the block. Thus, summing the per instructions stats -// will not add up to what is reported by NumFastIselFailures. -static void collectFailStats(const Instruction *I) { - switch (I->getOpcode()) { - default: assert (0 && "<Invalid operator> "); - - // Terminators - case Instruction::Ret: NumFastIselFailRet++; return; - case Instruction::Br: NumFastIselFailBr++; return; - case Instruction::Switch: NumFastIselFailSwitch++; return; - case Instruction::IndirectBr: NumFastIselFailIndirectBr++; return; - case Instruction::Invoke: NumFastIselFailInvoke++; return; - case Instruction::Resume: NumFastIselFailResume++; return; - case Instruction::Unreachable: NumFastIselFailUnreachable++; return; - - // Standard binary operators... - case Instruction::Add: NumFastIselFailAdd++; return; - case Instruction::FAdd: NumFastIselFailFAdd++; return; - case Instruction::Sub: NumFastIselFailSub++; return; - case Instruction::FSub: NumFastIselFailFSub++; return; - case Instruction::Mul: NumFastIselFailMul++; return; - case Instruction::FMul: NumFastIselFailFMul++; return; - case Instruction::UDiv: NumFastIselFailUDiv++; return; - case Instruction::SDiv: NumFastIselFailSDiv++; return; - case Instruction::FDiv: NumFastIselFailFDiv++; return; - case Instruction::URem: NumFastIselFailURem++; return; - case Instruction::SRem: NumFastIselFailSRem++; return; - case Instruction::FRem: NumFastIselFailFRem++; return; - - // Logical operators... - case Instruction::And: NumFastIselFailAnd++; return; - case Instruction::Or: NumFastIselFailOr++; return; - case Instruction::Xor: NumFastIselFailXor++; return; - - // Memory instructions... - case Instruction::Alloca: NumFastIselFailAlloca++; return; - case Instruction::Load: NumFastIselFailLoad++; return; - case Instruction::Store: NumFastIselFailStore++; return; - case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return; - case Instruction::AtomicRMW: NumFastIselFailAtomicRMW++; return; - case Instruction::Fence: NumFastIselFailFence++; return; - case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return; - - // Convert instructions... 
- case Instruction::Trunc: NumFastIselFailTrunc++; return; - case Instruction::ZExt: NumFastIselFailZExt++; return; - case Instruction::SExt: NumFastIselFailSExt++; return; - case Instruction::FPTrunc: NumFastIselFailFPTrunc++; return; - case Instruction::FPExt: NumFastIselFailFPExt++; return; - case Instruction::FPToUI: NumFastIselFailFPToUI++; return; - case Instruction::FPToSI: NumFastIselFailFPToSI++; return; - case Instruction::UIToFP: NumFastIselFailUIToFP++; return; - case Instruction::SIToFP: NumFastIselFailSIToFP++; return; - case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return; - case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return; - case Instruction::BitCast: NumFastIselFailBitCast++; return; - - // Other instructions... - case Instruction::ICmp: NumFastIselFailICmp++; return; - case Instruction::FCmp: NumFastIselFailFCmp++; return; - case Instruction::PHI: NumFastIselFailPHI++; return; - case Instruction::Select: NumFastIselFailSelect++; return; - case Instruction::Call: { - if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) { - switch (Intrinsic->getIntrinsicID()) { - default: - NumFastIselFailIntrinsicCall++; return; - case Intrinsic::sadd_with_overflow: - NumFastIselFailSAddWithOverflow++; return; - case Intrinsic::uadd_with_overflow: - NumFastIselFailUAddWithOverflow++; return; - case Intrinsic::ssub_with_overflow: - NumFastIselFailSSubWithOverflow++; return; - case Intrinsic::usub_with_overflow: - NumFastIselFailUSubWithOverflow++; return; - case Intrinsic::smul_with_overflow: - NumFastIselFailSMulWithOverflow++; return; - case Intrinsic::umul_with_overflow: - NumFastIselFailUMulWithOverflow++; return; - case Intrinsic::frameaddress: - NumFastIselFailFrameaddress++; return; - case Intrinsic::sqrt: - NumFastIselFailSqrt++; return; - case Intrinsic::experimental_stackmap: - NumFastIselFailStackMap++; return; - case Intrinsic::experimental_patchpoint_void: // fall-through - case Intrinsic::experimental_patchpoint_i64: - NumFastIselFailPatchPoint++; return; - } - } - NumFastIselFailCall++; - return; - } - case Instruction::Shl: NumFastIselFailShl++; return; - case Instruction::LShr: NumFastIselFailLShr++; return; - case Instruction::AShr: NumFastIselFailAShr++; return; - case Instruction::VAArg: NumFastIselFailVAArg++; return; - case Instruction::ExtractElement: NumFastIselFailExtractElement++; return; - case Instruction::InsertElement: NumFastIselFailInsertElement++; return; - case Instruction::ShuffleVector: NumFastIselFailShuffleVector++; return; - case Instruction::ExtractValue: NumFastIselFailExtractValue++; return; - case Instruction::InsertValue: NumFastIselFailInsertValue++; return; - case Instruction::LandingPad: NumFastIselFailLandingPad++; return; - } -} -#endif // NDEBUG - /// Set up SwiftErrorVals by going through the function. If the function has /// swifterror argument, it will be the first entry. static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, @@ -1166,6 +1058,7 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, FuncInfo->SwiftErrorVals.clear(); FuncInfo->SwiftErrorVRegDefMap.clear(); FuncInfo->SwiftErrorVRegUpwardsUse.clear(); + FuncInfo->SwiftErrorVRegDefUses.clear(); FuncInfo->SwiftErrorArg = nullptr; // Check if function has a swifterror argument. 
@@ -1190,9 +1083,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, } static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, + FastISel *FastIS, const TargetLowering *TLI, const TargetInstrInfo *TII, - const BasicBlock *LLVMBB, SelectionDAGBuilder *SDB) { if (!TLI->supportSwiftError()) return; @@ -1202,21 +1095,71 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, if (FuncInfo->SwiftErrorVals.empty()) return; - if (pred_begin(LLVMBB) == pred_end(LLVMBB)) { - auto &DL = FuncInfo->MF->getDataLayout(); - auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); - for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { - // We will always generate a copy from the argument. It is always used at - // least by the 'return' of the swifterror. - if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) + assert(FuncInfo->MBB == &*FuncInfo->MF->begin() && + "expected to insert into entry block"); + auto &DL = FuncInfo->MF->getDataLayout(); + auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); + for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { + // We will always generate a copy from the argument. It is always used at + // least by the 'return' of the swifterror. + if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) + continue; + unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); + // Assign Undef to Vreg. We construct MI directly to make sure it works + // with FastISel. + BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), + SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), + VReg); + + // Keep FastIS informed about the value we just inserted. + if (FastIS) + FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + + FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); + } +} + +/// Collect llvm.dbg.declare information. This is done after argument lowering +/// in case the declarations refer to arguments. +static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) { + MachineFunction *MF = FuncInfo->MF; + const DataLayout &DL = MF->getDataLayout(); + for (const BasicBlock &BB : *FuncInfo->Fn) { + for (const Instruction &I : BB) { + const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I); + if (!DI) + continue; + + assert(DI->getVariable() && "Missing variable"); + assert(DI->getDebugLoc() && "Missing location"); + const Value *Address = DI->getAddress(); + if (!Address) + continue; + + // Look through casts and constant offset GEPs. These mostly come from + // inalloca. + APInt Offset(DL.getPointerSizeInBits(0), 0); + Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // Check if the variable is a static alloca or a byval or inalloca + // argument passed in memory. If it is not, then we will ignore this + // intrinsic and handle this during isel like dbg.value. + int FI = std::numeric_limits<int>::max(); + if (const auto *AI = dyn_cast<AllocaInst>(Address)) { + auto SI = FuncInfo->StaticAllocaMap.find(AI); + if (SI != FuncInfo->StaticAllocaMap.end()) + FI = SI->second; + } else if (const auto *Arg = dyn_cast<Argument>(Address)) + FI = FuncInfo->getArgumentFrameIndex(Arg); + + if (FI == std::numeric_limits<int>::max()) continue; - unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); - // Assign Undef to Vreg. We construct MI directly to make sure it works - // with FastISel. 
- BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), - SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), - VReg); - FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); + + DIExpression *Expr = DI->getExpression(); + if (Offset.getBoolValue()) + Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, + Offset.getZExtValue()); + MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc()); } } } @@ -1339,7 +1282,82 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) { } } +void preassignSwiftErrorRegs(const TargetLowering *TLI, + FunctionLoweringInfo *FuncInfo, + BasicBlock::const_iterator Begin, + BasicBlock::const_iterator End) { + if (!TLI->supportSwiftError() || FuncInfo->SwiftErrorVals.empty()) + return; + + // Iterator over instructions and assign vregs to swifterror defs and uses. + for (auto It = Begin; It != End; ++It) { + ImmutableCallSite CS(&*It); + if (CS) { + // A call-site with a swifterror argument is both use and def. + const Value *SwiftErrorAddr = nullptr; + for (auto &Arg : CS.args()) { + if (!Arg->isSwiftError()) + continue; + // Use of swifterror. + assert(!SwiftErrorAddr && "Cannot have multiple swifterror arguments"); + SwiftErrorAddr = &*Arg; + assert(SwiftErrorAddr->isSwiftError() && + "Must have a swifterror value argument"); + unsigned VReg; bool CreatedReg; + std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt( + &*It, FuncInfo->MBB, SwiftErrorAddr); + assert(CreatedReg); + } + if (!SwiftErrorAddr) + continue; + + // Def of swifterror. + unsigned VReg; bool CreatedReg; + std::tie(VReg, CreatedReg) = + FuncInfo->getOrCreateSwiftErrorVRegDefAt(&*It); + assert(CreatedReg); + FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorAddr, VReg); + + // A load is a use. + } else if (const LoadInst *LI = dyn_cast<const LoadInst>(&*It)) { + const Value *V = LI->getOperand(0); + if (!V->isSwiftError()) + continue; + + unsigned VReg; bool CreatedReg; + std::tie(VReg, CreatedReg) = + FuncInfo->getOrCreateSwiftErrorVRegUseAt(LI, FuncInfo->MBB, V); + assert(CreatedReg); + + // A store is a def. + } else if (const StoreInst *SI = dyn_cast<const StoreInst>(&*It)) { + const Value *SwiftErrorAddr = SI->getOperand(1); + if (!SwiftErrorAddr->isSwiftError()) + continue; + + // Def of swifterror. + unsigned VReg; bool CreatedReg; + std::tie(VReg, CreatedReg) = + FuncInfo->getOrCreateSwiftErrorVRegDefAt(&*It); + assert(CreatedReg); + FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorAddr, VReg); + + // A return in a swiferror returning function is a use. + } else if (const ReturnInst *R = dyn_cast<const ReturnInst>(&*It)) { + const Function *F = R->getParent()->getParent(); + if(!F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + continue; + + unsigned VReg; bool CreatedReg; + std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt( + R, FuncInfo->MBB, FuncInfo->SwiftErrorArg); + assert(CreatedReg); + } + } +} + void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { + FastISelFailed = false; // Initialize the Fast-ISel state, if needed. FastISel *FastIS = nullptr; if (TM.Options.EnableFastISel) @@ -1347,12 +1365,55 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { setupSwiftErrorVals(Fn, TLI, FuncInfo); - // Iterate over all basic blocks in the function. 
ReversePostOrderTraversal<const Function*> RPOT(&Fn); - for (ReversePostOrderTraversal<const Function*>::rpo_iterator - I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { - const BasicBlock *LLVMBB = *I; + // Lower arguments up front. An RPO iteration always visits the entry block + // first. + assert(*RPOT.begin() == &Fn.getEntryBlock()); + ++NumEntryBlocks; + + // Set up FuncInfo for ISel. Entry blocks never have PHIs. + FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()]; + FuncInfo->InsertPt = FuncInfo->MBB->begin(); + + if (!FastIS) { + LowerArguments(Fn); + } else { + // See if fast isel can lower the arguments. + FastIS->startNewBlock(); + if (!FastIS->lowerArguments()) { + FastISelFailed = true; + // Fast isel failed to lower these arguments + ++NumFastIselFailLowerArguments; + + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Fn.getSubprogram(), + &Fn.getEntryBlock()); + R << "FastISel didn't lower all arguments: " + << ore::NV("Prototype", Fn.getType()); + reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1); + + // Use SelectionDAG argument lowering + LowerArguments(Fn); + CurDAG->setRoot(SDB->getControlRoot()); + SDB->clear(); + CodeGenAndEmitDAG(); + } + + // If we inserted any instructions at the beginning, make a note of + // where they are, so we can be sure to emit subsequent instructions + // after them. + if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) + FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + else + FastIS->setLastLocalValue(nullptr); + } + createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); + + processDbgDeclares(FuncInfo); + + // Iterate over all basic blocks in the function. + for (const BasicBlock *LLVMBB : RPOT) { if (OptLevel != CodeGenOpt::None) { bool AllPredsVisited = true; for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB); @@ -1384,8 +1445,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; if (!FuncInfo->MBB) continue; // Some blocks like catchpads have no code or MBB. - FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); - createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB); + + // Insert new instructions after any phi or argument setup code. + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = 0; @@ -1396,43 +1458,21 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { - FastIS->startNewBlock(); - - // Emit code for any incoming arguments. This must happen before - // beginning FastISel on the entry block. - if (LLVMBB == &Fn.getEntryBlock()) { - ++NumEntryBlocks; - - // Lower any arguments needed in this block if this is the entry block. - if (!FastIS->lowerArguments()) { - // Fast isel failed to lower these arguments - ++NumFastIselFailLowerArguments; - if (EnableFastISelAbort > 1) - report_fatal_error("FastISel didn't lower all arguments"); - - // Use SelectionDAG argument lowering - LowerArguments(Fn); - CurDAG->setRoot(SDB->getControlRoot()); - SDB->clear(); - CodeGenAndEmitDAG(); - } - - // If we inserted any instructions at the beginning, make a note of - // where they are, so we can be sure to emit subsequent instructions - // after them. 
- if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) - FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); - else - FastIS->setLastLocalValue(nullptr); - } + if (LLVMBB != &Fn.getEntryBlock()) + FastIS->startNewBlock(); unsigned NumFastIselRemaining = std::distance(Begin, End); + + // Pre-assign swifterror vregs. + preassignSwiftErrorRegs(TLI, FuncInfo, Begin, End); + // Do FastISel on as many instructions as possible. for (; BI != Begin; --BI) { const Instruction *Inst = &*std::prev(BI); // If we no longer require this instruction, skip it. - if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { + if (isFoldedOrDeadInstruction(Inst, FuncInfo) || + ElidedArgCopyInstrs.count(Inst)) { --NumFastIselRemaining; continue; } @@ -1465,22 +1505,28 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } -#ifndef NDEBUG - if (EnableFastISelVerbose2) - collectFailStats(Inst); -#endif + FastISelFailed = true; // Then handle certain instructions as single-LLVM-Instruction blocks. - if (isa<CallInst>(Inst)) { - - if (EnableFastISelVerbose || EnableFastISelAbort) { - dbgs() << "FastISel missed call: "; - Inst->dump(); + // We cannot separate out GCrelocates to their own blocks since we need + // to keep track of gc-relocates for a particular gc-statepoint. This is + // done by SelectionDAGBuilder::LowerAsSTATEPOINT, called before + // visitGCRelocate. + if (isa<CallInst>(Inst) && !isStatepoint(Inst) && !isGCRelocate(Inst)) { + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Inst->getDebugLoc(), LLVMBB); + + R << "FastISel missed call"; + + if (R.isEnabled() || EnableFastISelAbort) { + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << *Inst; + + R << ": " << InstStr.str(); } - if (EnableFastISelAbort > 2) - // FastISel selector couldn't handle something and bailed. - // For the purpose of debugging, just abort. - report_fatal_error("FastISel didn't select the entire block"); + + reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2); if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() && !Inst->use_empty()) { @@ -1509,35 +1555,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Inst->getDebugLoc(), LLVMBB); + bool ShouldAbort = EnableFastISelAbort; - if (EnableFastISelVerbose || EnableFastISelAbort) { - if (isa<TerminatorInst>(Inst)) { - // Use a different message for terminator misses. - dbgs() << "FastISel missed terminator: "; - // Don't abort unless for terminator unless the level is really high - ShouldAbort = (EnableFastISelAbort > 2); - } else { - dbgs() << "FastISel miss: "; - } - Inst->dump(); + if (isa<TerminatorInst>(Inst)) { + // Use a different message for terminator misses. + R << "FastISel missed terminator"; + // Don't abort for terminator unless the level is really high + ShouldAbort = (EnableFastISelAbort > 2); + } else { + R << "FastISel missed"; } - if (ShouldAbort) - // FastISel selector couldn't handle something and bailed. - // For the purpose of debugging, just abort. 
- report_fatal_error("FastISel didn't select the entire block"); + + if (R.isEnabled() || EnableFastISelAbort) { + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << *Inst; + R << ": " << InstStr.str(); + } + + reportFastISelFailure(*MF, *ORE, R, ShouldAbort); NumFastIselFailures += NumFastIselRemaining; break; } FastIS->recomputeInsertPt(); - } else { - // Lower any arguments needed in this block if this is the entry block. - if (LLVMBB == &Fn.getEntryBlock()) { - ++NumEntryBlocks; - LowerArguments(Fn); - } } + if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) { bool FunctionBasedInstrumentation = TLI->getSSPStackGuardCheck(*Fn.getParent()); @@ -1556,10 +1602,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // block. bool HadTailCall; SelectBasicBlock(Begin, BI, HadTailCall); + + // But if FastISel was run, we already selected some of the block. + // If we emitted a tail-call, we need to delete any previously emitted + // instruction that follows it. + if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end()) + FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end()); } FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); + ElidedArgCopyInstrs.clear(); } propagateSwiftErrorVRegs(FuncInfo); @@ -1975,11 +2028,11 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, // either already zero or is not demanded. Check for known zero input bits. APInt NeededMask = DesiredMask & ~ActualMask; - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(LHS, KnownZero, KnownOne); + KnownBits Known; + CurDAG->computeKnownBits(LHS, Known); // If all the missing bits in the or are already known to be set, match! - if ((NeededMask & KnownOne) == NeededMask) + if (NeededMask.isSubsetOf(Known.One)) return true; // TODO: check to see if missing bits are just not demanded. @@ -2062,7 +2115,7 @@ static SDNode *findGlueUse(SDNode *N) { } /// findNonImmUse - Return true if "Use" is a non-immediate use of "Def". -/// This function recursively traverses up the operand chain, ignoring +/// This function iteratively traverses up the operand chain, ignoring /// certain nodes. static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse, SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited, @@ -2075,30 +2128,36 @@ static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse, // The Use may be -1 (unassigned) if it is a newly allocated node. This can // happen because we scan down to newly selected nodes in the case of glue // uses. - if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)) - return false; + std::vector<SDNode *> WorkList; + WorkList.push_back(Use); - // Don't revisit nodes if we already scanned it and didn't fail, we know we - // won't fail if we scan it again. - if (!Visited.insert(Use).second) - return false; + while (!WorkList.empty()) { + Use = WorkList.back(); + WorkList.pop_back(); + if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1) + continue; - for (const SDValue &Op : Use->op_values()) { - // Ignore chain uses, they are validated by HandleMergeInputChains. - if (Op.getValueType() == MVT::Other && IgnoreChains) + // Don't revisit nodes if we already scanned it and didn't fail, we know we + // won't fail if we scan it again. + if (!Visited.insert(Use).second) continue; - SDNode *N = Op.getNode(); - if (N == Def) { - if (Use == ImmedUse || Use == Root) - continue; // We are not looking for immediate use. 
- assert(N != Root); - return true; - } + for (const SDValue &Op : Use->op_values()) { + // Ignore chain uses, they are validated by HandleMergeInputChains. + if (Op.getValueType() == MVT::Other && IgnoreChains) + continue; - // Traverse up the operand chain. - if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains)) - return true; + SDNode *N = Op.getNode(); + if (N == Def) { + if (Use == ImmedUse || Use == Root) + continue; // We are not looking for immediate use. + assert(N != Root); + return true; + } + + // Traverse up the operand chain. + WorkList.push_back(N); + } } return false; } @@ -2177,7 +2236,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, IgnoreChains = false; } - SmallPtrSet<SDNode*, 16> Visited; return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); } @@ -2554,7 +2612,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, - const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { + const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) { // Accept if it is exactly the same as a previously recorded node. unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid CheckSame"); @@ -2564,9 +2622,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// CheckChildSame - Implements OP_CheckChildXSame. LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, - const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes, - unsigned ChildNo) { + SDValue N, + const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes, + unsigned ChildNo) { if (ChildNo >= N.getNumOperands()) return false; // Match fails if out of range child #. return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo), @@ -2688,7 +2746,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table, unsigned Index, SDValue N, bool &Result, const SelectionDAGISel &SDISel, - SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { + SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) { switch (Table[Index++]) { default: Result = false; @@ -2756,6 +2814,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table, } namespace { + struct MatchScope { /// FailIndex - If this match fails, this is the index to continue with. unsigned FailIndex; @@ -2785,6 +2844,7 @@ class MatchStateUpdater : public SelectionDAG::DAGUpdateListener SDNode **NodeToMatch; SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes; SmallVectorImpl<MatchScope> &MatchScopes; + public: MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch, SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN, @@ -2816,6 +2876,7 @@ public: J.setNode(E); } }; + } // end anonymous namespace void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, @@ -2921,7 +2982,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // with an OPC_SwitchOpcode instruction. Populate the table now, since this // is the first time we're selecting an instruction. unsigned Idx = 1; - while (1) { + while (true) { // Get the size of this case. 
unsigned CaseSize = MatcherTable[Idx++]; if (CaseSize & 128) @@ -2942,7 +3003,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, MatcherIndex = OpcodeOffset[N.getOpcode()]; } - while (1) { + while (true) { assert(MatcherIndex < TableSize && "Invalid index"); #ifndef NDEBUG unsigned CurrentOpcodeIndex = MatcherIndex; @@ -2957,7 +3018,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // immediately fail, don't even bother pushing a scope for them. unsigned FailIndex; - while (1) { + while (true) { unsigned NumToSkip = MatcherTable[MatcherIndex++]; if (NumToSkip & 128) NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex); @@ -3118,7 +3179,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, unsigned CurNodeOpcode = N.getOpcode(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; - while (1) { + while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) @@ -3149,7 +3210,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, MVT CurNodeVT = N.getSimpleValueType(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; - while (1) { + while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) @@ -3215,7 +3276,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // a single use. bool HasMultipleUses = false; for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) - if (!NodeStack[i].hasOneUse()) { + if (!NodeStack[i].getNode()->hasOneUse()) { HasMultipleUses = true; break; } @@ -3381,6 +3442,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr)); continue; } + case OPC_Coverage: { + // This is emitted right before MorphNode/EmitNode. + // So it should be safe to assume that this node has been selected + unsigned index = MatcherTable[MatcherIndex++]; + index |= (MatcherTable[MatcherIndex++] << 8); + dbgs() << "COVERED: " << getPatternForIndex(index) << "\n"; + dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n"; + continue; + } case OPC_EmitNode: case OPC_MorphNodeTo: case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2: @@ -3473,7 +3543,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i), nullptr)); } - } else { assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE && "NodeToMatch was removed partway through selection"); @@ -3610,7 +3679,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // find a case to check. 
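// --- Illustrative aside (not part of the patch) ---------------------------
// Sketch of how a VBR-encoded value in the matcher table can be decoded.
// This assumes the usual little-endian base-128 scheme (7 value bits per
// byte, bit 7 set meaning "another byte follows"); the GetVBR helper in
// SelectionDAGISel.cpp is the authoritative implementation, and decodeVBR
// here is only a stand-in name.
#include <cstdint>

static uint64_t decodeVBR(const unsigned char *Table, unsigned &Idx) {
  uint64_t Val = 0;
  unsigned Shift = 0;
  unsigned char Byte;
  do {
    Byte = Table[Idx++];
    Val |= uint64_t(Byte & 127) << Shift;
    Shift += 7;
  } while (Byte & 128);
  return Val;
}
// Example: the byte sequence {0x85, 0x02} decodes to 5 | (2 << 7) == 261,
// which is why the callers above test "CaseSize & 128" before calling GetVBR.
// ---------------------------------------------------------------------------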
DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n"); ++NumDAGIselRetries; - while (1) { + while (true) { if (MatchScopes.empty()) { CannotYetSelect(NodeToMatch); return; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 2764688..11561df 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index d27e245..5d78bba 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -17,9 +17,9 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/CallingConv.h" @@ -110,8 +110,8 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, Builder.FuncInfo.StatepointStackSlots.size() && "Broken invariant"); - StatepointMaxSlotsRequired = std::max<unsigned long>( - StatepointMaxSlotsRequired, Builder.FuncInfo.StatepointStackSlots.size()); + StatepointMaxSlotsRequired.updateMax( + Builder.FuncInfo.StatepointStackSlots.size()); return SpillSlot; } @@ -242,7 +242,8 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue, // Cache this slot so we find it when going through the normal // assignment loop. - SDValue Loc = Builder.DAG.getTargetFrameIndex(*Index, Incoming.getValueType()); + SDValue Loc = + Builder.DAG.getTargetFrameIndex(*Index, Builder.getFrameIndexTy()); Builder.StatepointLowering.setLocation(Incoming, Loc); } @@ -343,7 +344,7 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, Builder); int Index = cast<FrameIndexSDNode>(Loc)->getIndex(); // We use TargetFrameIndex so that isel will not select it into LEA - Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType()); + Loc = Builder.DAG.getTargetFrameIndex(Index, Builder.getFrameIndexTy()); // TODO: We can create TokenFactor node instead of // chaining stores one after another, this may allow @@ -391,8 +392,10 @@ static void lowerIncomingStatepointValue(SDValue Incoming, bool LiveInOnly, // This handles allocas as arguments to the statepoint (this is only // really meaningful for a deopt value. For GC, we'd be trying to // relocate the address of the alloca itself?) 
+ assert(Incoming.getValueType() == Builder.getFrameIndexTy() && + "Incoming value is a frame index!"); Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(), - Incoming.getValueType())); + Builder.getFrameIndexTy())); } else if (LiveInOnly) { // If this value is live in (not live-on-return, or live-through), we can // treat it the same way patchpoint treats it's "live in" values. We'll @@ -527,8 +530,10 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, SDValue Incoming = Builder.getValue(V); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) { // This handles allocas as arguments to the statepoint + assert(Incoming.getValueType() == Builder.getFrameIndexTy() && + "Incoming value is a frame index!"); Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(), - Incoming.getValueType())); + Builder.getFrameIndexTy())); } } @@ -813,7 +818,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP, SI.GCTransitionArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end()); SI.ID = ISP.getID(); - SI.DeoptState = ArrayRef<const Use>(ISP.vm_state_begin(), ISP.vm_state_end()); + SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end()); SI.StatepointFlags = ISP.getFlags(); SI.NumPatchBytes = ISP.getNumPatchBytes(); SI.EHPadBB = EHPadBB; @@ -835,7 +840,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP, // completely and make statepoint call to return a tuple. unsigned Reg = FuncInfo.CreateRegs(RetTy); RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), - DAG.getDataLayout(), Reg, RetTy); + DAG.getDataLayout(), Reg, RetTy, true); SDValue Chain = DAG.getEntryNode(); RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr); @@ -949,8 +954,8 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { return; } - SDValue SpillSlot = DAG.getTargetFrameIndex(*DerivedPtrLocation, - SD.getValueType()); + SDValue SpillSlot = + DAG.getTargetFrameIndex(*DerivedPtrLocation, getFrameIndexTy()); // Be conservative: flush all pending loads // TODO: Probably we can be less restrictive on this, @@ -958,7 +963,9 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { SDValue Chain = getRoot(); SDValue SpillLoad = - DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot, + DAG.getLoad(DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), + Relocate.getType()), + getCurSDLoc(), Chain, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), *DerivedPtrLocation)); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 690f0d2..8652df7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -55,14 +56,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, // Conservatively require the attributes of the call to match those of // the return. Ignore noalias because it doesn't affect the call sequence. 
- AttributeSet CallerAttrs = F->getAttributes(); - if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex) - .removeAttribute(Attribute::NoAlias).hasAttributes()) + AttributeList CallerAttrs = F->getAttributes(); + if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) + .hasAttributes()) return false; // It's not safe to eliminate the sign / zero extension of the return value. - if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || - CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) return false; // Check if the only use is a function return node. @@ -96,19 +98,19 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, /// \brief Set CallLoweringInfo attribute flags based on a call instruction /// and called function attributes. -void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS, - unsigned AttrIdx) { - isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); - isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); - isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); - isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); - isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); - isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); - isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); - isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); - isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); - Alignment = CS->getParamAlignment(AttrIdx); +void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS, + unsigned ArgIdx) { + IsSExt = CS->paramHasAttr(ArgIdx, Attribute::SExt); + IsZExt = CS->paramHasAttr(ArgIdx, Attribute::ZExt); + IsInReg = CS->paramHasAttr(ArgIdx, Attribute::InReg); + IsSRet = CS->paramHasAttr(ArgIdx, Attribute::StructRet); + IsNest = CS->paramHasAttr(ArgIdx, Attribute::Nest); + IsByVal = CS->paramHasAttr(ArgIdx, Attribute::ByVal); + IsInAlloca = CS->paramHasAttr(ArgIdx, Attribute::InAlloca); + IsReturned = CS->paramHasAttr(ArgIdx, Attribute::Returned); + IsSwiftSelf = CS->paramHasAttr(ArgIdx, Attribute::SwiftSelf); + IsSwiftError = CS->paramHasAttr(ArgIdx, Attribute::SwiftError); + Alignment = CS->getParamAlignment(ArgIdx); } /// Generate a libcall taking the given operands as arguments and returning a @@ -125,8 +127,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, for (SDValue Op : Ops) { Entry.Node = Op; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); - Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); Args.push_back(Entry); } @@ -138,10 +140,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed) - 
.setSExtResult(signExtend).setZExtResult(!signExtend); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) + .setNoReturn(doesNotReturn) + .setDiscardResult(!isReturnValueUsed) + .setSExtResult(signExtend) + .setZExtResult(!signExtend); return LowerCallTo(CLI); } @@ -334,34 +339,40 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // Optimization Methods //===----------------------------------------------------------------------===// -/// Check to see if the specified operand of the specified instruction is a -/// constant integer. If so, check to see if there are any bits set in the -/// constant that are not demanded. If so, shrink the constant and return true. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, - const APInt &Demanded) { - SDLoc dl(Op); +/// If the specified instruction has a constant integer operand and there are +/// bits set in that constant that are not demanded, then clear those bits and +/// return true. +bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const { + SelectionDAG &DAG = TLO.DAG; + SDLoc DL(Op); + unsigned Opcode = Op.getOpcode(); + + // Do target-specific constant optimization. + if (targetShrinkDemandedConstant(Op, Demanded, TLO)) + return TLO.New.getNode(); // FIXME: ISD::SELECT, ISD::SELECT_CC - switch (Op.getOpcode()) { - default: break; + switch (Opcode) { + default: + break; case ISD::XOR: case ISD::AND: case ISD::OR: { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!C) return false; + auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Op1C) + return false; - if (Op.getOpcode() == ISD::XOR && - (C->getAPIntValue() | (~Demanded)).isAllOnesValue()) + // If this is a 'not' op, don't touch it because that's a canonical form. + const APInt &C = Op1C->getAPIntValue(); + if (Opcode == ISD::XOR && Demanded.isSubsetOf(C)) return false; - // if we can expand it to have all bits set, do it - if (C->getAPIntValue().intersects(~Demanded)) { + if (!C.isSubsetOf(Demanded)) { EVT VT = Op.getValueType(); - SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0), - DAG.getConstant(Demanded & - C->getAPIntValue(), - dl, VT)); - return CombineTo(Op, New); + SDValue NewC = DAG.getConstant(Demanded & C, DL, VT); + SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC); + return TLO.CombineTo(Op, NewOp); } break; @@ -374,15 +385,17 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. /// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be /// generalized for targets with other types of implicit widening casts. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, - unsigned BitWidth, - const APInt &Demanded, - const SDLoc &dl) { +bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, + const APInt &Demanded, + TargetLoweringOpt &TLO) const { assert(Op.getNumOperands() == 2 && "ShrinkDemandedOp only supports binary operators!"); assert(Op.getNode()->getNumValues() == 1 && "ShrinkDemandedOp only supports nodes with one result!"); + SelectionDAG &DAG = TLO.DAG; + SDLoc dl(Op); + // Early return, as this function cannot handle vector types. 
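// --- Illustrative aside (not part of the patch) ---------------------------
// The effect of ShrinkDemandedConstant on a scalar AND, demonstrated with
// plain 32-bit integers rather than APInt. If only the low byte of the
// result is demanded, any constant bits outside that byte are dead, so
// (x & 0x00FF00FF) can be rewritten as (x & 0x000000FF) without changing
// the demanded bits. A minimal self-check:
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Demanded = 0x000000FFu;
  const uint32_t C = 0x00FF00FFu;
  const uint32_t ShrunkC = C & Demanded; // what the transform builds
  for (uint32_t x : {0u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(((x & C) & Demanded) == ((x & ShrunkC) & Demanded));
  return 0;
}
// ---------------------------------------------------------------------------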
if (Op.getValueType().isVector()) return false; @@ -404,31 +417,28 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, if (TLI.isTruncateFree(Op.getValueType(), SmallVT) && TLI.isZExtFree(SmallVT, Op.getValueType())) { // We found a type with free casts. - SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT, - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(0)), - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(1))); + SDValue X = DAG.getNode( + Op.getOpcode(), dl, SmallVT, + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)), + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1))); bool NeedZext = DemandedSize > SmallVTBits; SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, dl, Op.getValueType(), X); - return CombineTo(Op, Z); + return TLO.CombineTo(Op, Z); } } return false; } bool -TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, - unsigned OpIdx, - const APInt &Demanded, - DAGCombinerInfo &DCI) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +TargetLowering::SimplifyDemandedBits(SDNode *User, unsigned OpIdx, + const APInt &Demanded, + DAGCombinerInfo &DCI, + TargetLoweringOpt &TLO) const { SDValue Op = User->getOperand(OpIdx); - APInt KnownZero, KnownOne; + KnownBits Known; - if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, - *this, 0, true)) + if (!SimplifyDemandedBits(Op, Demanded, Known, TLO, 0, true)) return false; @@ -440,9 +450,9 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, // with the value 'x', which will give us: // Old = i32 and x, 0xffffff // New = x - if (Old.hasOneUse()) { + if (TLO.Old.hasOneUse()) { // For the one use case, we just commit the change. - DCI.CommitTargetLoweringOpt(*this); + DCI.CommitTargetLoweringOpt(TLO); return true; } @@ -450,17 +460,17 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, // AssumeSingleUse flag is not propogated to recursive calls of // SimplifyDemanded bits, so the only node with multiple use that // it will attempt to combine will be opt. - assert(Old == Op); + assert(TLO.Old == Op); SmallVector <SDValue, 4> NewOps; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { if (i == OpIdx) { - NewOps.push_back(New); + NewOps.push_back(TLO.New); continue; } NewOps.push_back(User->getOperand(i)); } - DAG.UpdateNodeOperands(User, NewOps); + TLO.DAG.UpdateNodeOperands(User, NewOps); // Op has less users now, so we may be able to perform additional combines // with it. DCI.AddToWorklist(Op.getNode()); @@ -470,17 +480,30 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, return true; } +bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask, + DAGCombinerInfo &DCI) const { + + SelectionDAG &DAG = DCI.DAG; + TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + KnownBits Known; + + bool Simplified = SimplifyDemandedBits(Op, DemandedMask, Known, TLO); + if (Simplified) + DCI.CommitTargetLoweringOpt(TLO); + return Simplified; +} + /// Look at Op. At this point, we know that only the DemandedMask bits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the /// original and new nodes in Old and New. Otherwise, analyze the expression and -/// return a mask of KnownOne and KnownZero bits for the expression (used to -/// simplify the caller). 
The KnownZero/One bits may only be accurate for those -/// bits in the DemandedMask. +/// return a mask of Known bits for the expression (used to simplify the +/// caller). The Known bits may only be accurate for those bits in the +/// DemandedMask. bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth, bool AssumeSingleUse) const { @@ -492,14 +515,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, auto &DL = TLO.DAG.getDataLayout(); // Don't know anything. - KnownZero = KnownOne = APInt(BitWidth, 0); + Known = KnownBits(BitWidth); // Other users may use these bits. if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) { if (Depth != 0) { - // If not at the root, Just compute the KnownZero/KnownOne bits to + // If not at the root, Just compute the Known bits to // simplify things downstream. - TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth); + TLO.DAG.computeKnownBits(Op, Known, Depth); return false; } // If this is the root being simplified, allow it to have multiple uses, @@ -514,38 +537,36 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } - APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut; + KnownBits Known2, KnownOut; switch (Op.getOpcode()) { case ISD::Constant: // We know all of the bits for a constant! - KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue(); - KnownZero = ~KnownOne; + Known.One = cast<ConstantSDNode>(Op)->getAPIntValue(); + Known.Zero = ~Known.One; return false; // Don't fall through, will infinitely loop. case ISD::BUILD_VECTOR: // Collect the known bits that are shared by every constant vector element. - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); for (SDValue SrcOp : Op->ops()) { if (!isa<ConstantSDNode>(SrcOp)) { // We can only handle all constant values - bail out with no known bits. - KnownZero = KnownOne = APInt(BitWidth, 0); + Known = KnownBits(BitWidth); return false; } - KnownOne2 = cast<ConstantSDNode>(SrcOp)->getAPIntValue(); - KnownZero2 = ~KnownOne2; + Known2.One = cast<ConstantSDNode>(SrcOp)->getAPIntValue(); + Known2.Zero = ~Known2.One; // BUILD_VECTOR can implicitly truncate sources, we must handle this. - if (KnownOne2.getBitWidth() != BitWidth) { - assert(KnownOne2.getBitWidth() > BitWidth && - KnownZero2.getBitWidth() > BitWidth && + if (Known2.One.getBitWidth() != BitWidth) { + assert(Known2.getBitWidth() > BitWidth && "Expected BUILD_VECTOR implicit truncation"); - KnownOne2 = KnownOne2.trunc(BitWidth); - KnownZero2 = KnownZero2.trunc(BitWidth); + Known2 = Known2.trunc(BitWidth); } // Known bits are the values that are shared by every element. // TODO: support per-element known bits. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } return false; // Don't fall through, will infinitely loop. case ISD::AND: @@ -553,18 +574,18 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // using the bits from the RHS. Below, we use knowledge about the RHS to // simplify the LHS, here we're using information from the LHS to simplify // the RHS. - if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (ConstantSDNode *RHSC = isConstOrConstSplat(Op.getOperand(1))) { SDValue Op0 = Op.getOperand(0); - APInt LHSZero, LHSOne; + KnownBits LHSKnown; // Do not increment Depth here; that can cause an infinite loop. 
- TLO.DAG.computeKnownBits(Op0, LHSZero, LHSOne, Depth); + TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth); // If the LHS already has zeros where RHSC does, this and is dead. - if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask)) + if ((LHSKnown.Zero & NewMask) == (~RHSC->getAPIntValue() & NewMask)) return TLO.CombineTo(Op, Op0); // If any of the set bits in the RHS are known zero on the LHS, shrink // the constant. - if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask)) + if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & NewMask, TLO)) return true; // Bitwise-not (xor X, -1) is a special case: we don't usually shrink its @@ -573,183 +594,191 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // the xor. For example, for a 32-bit X: // and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1 if (isBitwiseNot(Op0) && Op0.hasOneUse() && - LHSOne == ~RHSC->getAPIntValue()) { + LHSKnown.One == ~RHSC->getAPIntValue()) { SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, Op.getValueType(), Op0.getOperand(0), Op.getOperand(1)); return TLO.CombineTo(Op, Xor); } } - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask, - KnownZero2, KnownOne2, TLO, Depth+1)) + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + if (SimplifyDemandedBits(Op.getOperand(0), ~Known.Zero & NewMask, + Known2, TLO, Depth+1)) return true; - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known one on one side, return the other. // These bits cannot contribute to the result of the 'and'. - if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask)) + if (NewMask.isSubsetOf(Known2.Zero | Known.One)) return TLO.CombineTo(Op, Op.getOperand(0)); - if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask)) + if (NewMask.isSubsetOf(Known.Zero | Known2.One)) return TLO.CombineTo(Op, Op.getOperand(1)); // If all of the demanded bits in the inputs are known zeros, return zero. - if ((NewMask & (KnownZero|KnownZero2)) == NewMask) + if (NewMask.isSubsetOf(Known.Zero | Known2.Zero)) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType())); // If the RHS is a constant, see if we can simplify it. - if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask)) + if (ShrinkDemandedConstant(Op, ~Known2.Zero & NewMask, TLO)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // Output known-1 bits are only known if set in both the LHS & RHS. - KnownOne &= KnownOne2; + Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. 
- KnownZero |= KnownZero2; + Known.Zero |= Known2.Zero; break; case ISD::OR: - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask, - KnownZero2, KnownOne2, TLO, Depth+1)) + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + if (SimplifyDemandedBits(Op.getOperand(0), ~Known.One & NewMask, + Known2, TLO, Depth+1)) return true; - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'or'. - if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask)) + if (NewMask.isSubsetOf(Known2.One | Known.Zero)) return TLO.CombineTo(Op, Op.getOperand(0)); - if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask)) - return TLO.CombineTo(Op, Op.getOperand(1)); - // If all of the potentially set bits on one side are known to be set on - // the other side, just use the 'other' side. - if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask)) - return TLO.CombineTo(Op, Op.getOperand(0)); - if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask)) + if (NewMask.isSubsetOf(Known.One | Known2.Zero)) return TLO.CombineTo(Op, Op.getOperand(1)); // If the RHS is a constant, see if we can simplify it. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // Output known-0 bits are only known if clear in both the LHS & RHS. - KnownZero &= KnownZero2; + Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. - KnownOne |= KnownOne2; + Known.One |= Known2.One; break; - case ISD::XOR: - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + case ISD::XOR: { + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2, - KnownOne2, TLO, Depth+1)) + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + if (SimplifyDemandedBits(Op.getOperand(0), NewMask, Known2, TLO, Depth+1)) return true; - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. - if ((KnownZero & NewMask) == NewMask) + if (NewMask.isSubsetOf(Known.Zero)) return TLO.CombineTo(Op, Op.getOperand(0)); - if ((KnownZero2 & NewMask) == NewMask) + if (NewMask.isSubsetOf(Known2.Zero)) return TLO.CombineTo(Op, Op.getOperand(1)); // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // If all of the unknown bits are known to be zero on one side or the other // (but not both) turn this into an *inclusive* or. // e.g. 
(A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 - if ((NewMask & ~KnownZero & ~KnownZero2) == 0) + if ((NewMask & ~Known.Zero & ~Known2.Zero) == 0) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(), Op.getOperand(0), Op.getOperand(1))); // Output known-0 bits are known if clear or set in both the LHS & RHS. - KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); + KnownOut.Zero = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. - KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); + KnownOut.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); // If all of the demanded bits on one side are known, and all of the set // bits on that side are also known to be set on the other side, turn this // into an AND, as we know the bits will be cleared. // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 // NB: it is okay if more bits are known than are requested - if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side - if (KnownOne == KnownOne2) { // set bits are the same on both sides + if (NewMask.isSubsetOf(Known.Zero|Known.One)) { // all known on one side + if (Known.One == Known2.One) { // set bits are the same on both sides EVT VT = Op.getValueType(); - SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, dl, VT); + SDValue ANDC = TLO.DAG.getConstant(~Known.One & NewMask, dl, VT); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), ANDC)); } } - // If the RHS is a constant, see if we can simplify it. - // for XOR, we prefer to force bits to 1 if they will make a -1. - // If we can't force bits, try to shrink the constant. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - APInt Expanded = C->getAPIntValue() | (~NewMask); - // If we can expand it to have all bits set, do it. - if (Expanded.isAllOnesValue()) { - if (Expanded != C->getAPIntValue()) { - EVT VT = Op.getValueType(); - SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0), - TLO.DAG.getConstant(Expanded, dl, VT)); - return TLO.CombineTo(Op, New); - } - // If it already has all the bits set, nothing to change - // but don't shrink either! - } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) { - return true; + // If the RHS is a constant, see if we can change it. Don't alter a -1 + // constant because that's a 'not' op, and that is better for combining and + // codegen. + ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1)); + if (C && !C->isAllOnesValue()) { + if (NewMask.isSubsetOf(C->getAPIntValue())) { + // We're flipping all demanded bits. Flip the undemanded bits too. + SDValue New = TLO.DAG.getNOT(dl, Op.getOperand(0), Op.getValueType()); + return TLO.CombineTo(Op, New); } + // If we can't turn this into a 'not', try to shrink the constant. 
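// --- Illustrative aside (not part of the patch) ---------------------------
// The two XOR rewrites above, checked on plain 32-bit integers. First, an
// xor of two values masked by disjoint constants is the same as an or.
// Second, when every demanded bit is set in the xor constant, the node
// behaves like a plain 'not' on the demanded bits, which is why the new
// code prefers getNOT over shrinking the constant.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0x00FF00FFu, C2 = 0xFF00FF00u; // C1 & C2 == 0
  const uint32_t Demanded = 0x0000FFFFu;
  const uint32_t C = 0x8000FFFFu; // Demanded is a subset of C
  for (uint32_t A : {0x12345678u, 0xFFFFFFFFu, 0u})
    for (uint32_t B : {0x9ABCDEF0u, 0x0F0F0F0Fu, 0u}) {
      // (A & C1) ^ (B & C2) == (A & C1) | (B & C2)  iff  C1 & C2 == 0
      assert(((A & C1) ^ (B & C2)) == ((A & C1) | (B & C2)));
      // A ^ C agrees with ~A on every demanded bit when Demanded is a
      // subset of C, i.e. the xor is a 'not' as far as the user can see.
      assert(((A ^ C) & Demanded) == (~A & Demanded));
    }
  return 0;
}
// ---------------------------------------------------------------------------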
+ if (ShrinkDemandedConstant(Op, NewMask, TLO)) + return true; } - KnownZero = KnownZeroOut; - KnownOne = KnownOneOut; + Known = std::move(KnownOut); break; + } case ISD::SELECT: - if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known, TLO, Depth+1)) return true; - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2, - KnownOne2, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known2, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // Only known if known in both the LHS and RHS. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; case ISD::SELECT_CC: - if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(3), NewMask, Known, TLO, Depth+1)) return true; - if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2, - KnownOne2, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known2, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // Only known if known in both the LHS and RHS. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; + case ISD::SETCC: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + // If (1) we only need the sign-bit, (2) the setcc operands are the same + // width as the setcc result, and (3) the result of a setcc conforms to 0 or + // -1, we may be able to bypass the setcc. + if (NewMask.isSignMask() && Op0.getScalarValueSizeInBits() == BitWidth && + getBooleanContents(Op.getValueType()) == + BooleanContent::ZeroOrNegativeOneBooleanContent) { + // If we're testing X < 0, then this compare isn't needed - just use X! + // FIXME: We're limiting to integer types here, but this should also work + // if we don't care about FP signed-zero. The use of SETLT with FP means + // that we don't care about NaNs. + if (CC == ISD::SETLT && Op1.getValueType().isInteger() && + (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode()))) + return TLO.CombineTo(Op, Op0); + + // TODO: Should we check for other forms of sign-bit comparisons? 
+ // Examples: X <= -1, X >= 0 + } + if (getBooleanContents(Op0.getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + break; + } case ISD::SHL: if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); @@ -781,17 +810,16 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt), - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt), Known, TLO, Depth+1)) return true; // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits // are not demanded. This will likely allow the anyext to be folded away. if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) { - SDValue InnerOp = InOp.getNode()->getOperand(0); + SDValue InnerOp = InOp.getOperand(0); EVT InnerVT = InnerOp.getValueType(); unsigned InnerBits = InnerVT.getSizeInBits(); - if (ShAmt < InnerBits && NewMask.lshr(InnerBits) == 0 && + if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits && isTypeDesirableForOp(ISD::SHL, InnerVT)) { EVT ShTy = getShiftAmountTy(InnerVT, DL); if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits())) @@ -813,12 +841,12 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, InnerOp.getOpcode() == ISD::SRL && InnerOp.hasOneUse() && isa<ConstantSDNode>(InnerOp.getOperand(1))) { - uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1)) + unsigned InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1)) ->getZExtValue(); if (InnerShAmt < ShAmt && InnerShAmt < InnerBits && - NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 && - NewMask.trunc(ShAmt) == 0) { + NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) && + NewMask.countTrailingZeros() >= ShAmt) { SDValue NewSA = TLO.DAG.getConstant(ShAmt - InnerShAmt, dl, Op.getOperand(1).getValueType()); @@ -831,10 +859,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - KnownZero <<= SA->getZExtValue(); - KnownOne <<= SA->getZExtValue(); + Known.Zero <<= SA->getZExtValue(); + Known.One <<= SA->getZExtValue(); // low bits known zero. - KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue()); + Known.Zero.setLowBits(SA->getZExtValue()); } break; case ISD::SRL: @@ -852,8 +880,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the shift is exact, then it does demand the low bits (and knows that // they are zero). - if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) - InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); + if (Op->getFlags().hasExact()) + InDemandedMask.setLowBits(ShAmt); // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a // single shift. We can do this if the top bits (which are shifted out) @@ -877,15 +905,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } // Compute the new bits that are at the top now. - if (SimplifyDemandedBits(InOp, InDemandedMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(InOp, InDemandedMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.lshr(ShAmt); - KnownOne = KnownOne.lshr(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - KnownZero |= HighBits; // High bits known zero. + Known.Zero.setHighBits(ShAmt); // High bits known zero. 
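// --- Illustrative aside (not part of the patch) ---------------------------
// Why SRL may mark the high ShAmt bits of the result as known zero, shown
// with plain 32-bit values: a logical shift right by ShAmt clears the top
// ShAmt bits no matter what the input was.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 5;
  const uint32_t HighBits = ~0u << (32 - ShAmt); // the top ShAmt bits
  for (uint32_t x : {0u, 0x80000001u, 0x12345678u, 0xFFFFFFFFu})
    assert(((x >> ShAmt) & HighBits) == 0);
  return 0;
}
// ---------------------------------------------------------------------------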
} break; case ISD::SRA: @@ -893,7 +919,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // always convert this into a logical shr, even if the shift amount is // variable. The low bit of the shift cannot be an input sign bit unless // the shift amount is >= the size of the datatype, which is undefined. - if (NewMask == 1) + if (NewMask.isOneValue()) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op.getOperand(0), Op.getOperand(1))); @@ -910,33 +936,30 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the shift is exact, then it does demand the low bits (and knows that // they are zero). - if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) - InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); + if (Op->getFlags().hasExact()) + InDemandedMask.setLowBits(ShAmt); // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - if (HighBits.intersects(NewMask)) - InDemandedMask |= APInt::getSignBit(VT.getScalarSizeInBits()); + if (NewMask.countLeadingZeros() < ShAmt) + InDemandedMask.setSignBit(); - if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, Known, TLO, + Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.lshr(ShAmt); - KnownOne = KnownOne.lshr(ShAmt); - - // Handle the sign bit, adjusted to where it is now in the mask. - APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. - if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) { + if (Known.Zero[BitWidth - ShAmt - 1] || + NewMask.countLeadingZeros() >= ShAmt) { SDNodeFlags Flags; - Flags.setExact(cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()); + Flags.setExact(Op->getFlags().hasExact()); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op.getOperand(0), - Op.getOperand(1), &Flags)); + Op.getOperand(1), Flags)); } int Log2 = NewMask.exactLogBase2(); @@ -949,9 +972,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getOperand(0), NewSA)); } - if (KnownOne.intersects(SignBit)) + if (Known.One[BitWidth - ShAmt - 1]) // New bits are known one. - KnownOne |= HighBits; + Known.One.setHighBits(ShAmt); } break; case ISD::SIGN_EXTEND_INREG: { @@ -993,7 +1016,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return TLO.CombineTo(Op, Op.getOperand(0)); APInt InSignBit = - APInt::getSignBit(ExVT.getScalarSizeInBits()).zext(BitWidth); + APInt::getSignMask(ExVT.getScalarSizeInBits()).zext(BitWidth); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, ExVT.getScalarSizeInBits()) & @@ -1004,24 +1027,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, InputDemandedBits |= InSignBit; if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits, - KnownZero, KnownOne, TLO, Depth+1)) + Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. 
// If the input sign bit is known zero, convert this into a zero extension. - if (KnownZero.intersects(InSignBit)) + if (Known.Zero.intersects(InSignBit)) return TLO.CombineTo(Op, TLO.DAG.getZeroExtendInReg( Op.getOperand(0), dl, ExVT.getScalarType())); - if (KnownOne.intersects(InSignBit)) { // Input sign bit known set - KnownOne |= NewBits; - KnownZero &= ~NewBits; + if (Known.One.intersects(InSignBit)) { // Input sign bit known set + Known.One |= NewBits; + Known.Zero &= ~NewBits; } else { // Input sign bit unknown - KnownZero &= ~NewBits; - KnownOne &= ~NewBits; + Known.Zero &= ~NewBits; + Known.One &= ~NewBits; } break; } @@ -1032,22 +1055,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt MaskLo = NewMask.getLoBits(HalfBitWidth).trunc(HalfBitWidth); APInt MaskHi = NewMask.getHiBits(HalfBitWidth).trunc(HalfBitWidth); - APInt KnownZeroLo, KnownOneLo; - APInt KnownZeroHi, KnownOneHi; + KnownBits KnownLo, KnownHi; - if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownZeroLo, - KnownOneLo, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownLo, TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownZeroHi, - KnownOneHi, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownHi, TLO, Depth + 1)) return true; - KnownZero = KnownZeroLo.zext(BitWidth) | - KnownZeroHi.zext(BitWidth).shl(HalfBitWidth); + Known.Zero = KnownLo.Zero.zext(BitWidth) | + KnownHi.Zero.zext(BitWidth).shl(HalfBitWidth); - KnownOne = KnownOneLo.zext(BitWidth) | - KnownOneHi.zext(BitWidth).shl(HalfBitWidth); + Known.One = KnownLo.One.zext(BitWidth) | + KnownHi.One.zext(BitWidth).shl(HalfBitWidth); break; } case ISD::ZERO_EXTEND: { @@ -1062,20 +1082,18 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getValueType(), Op.getOperand(0))); - if (SimplifyDemandedBits(Op.getOperand(0), InMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); - KnownZero |= NewBits; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known = Known.zext(BitWidth); + Known.Zero |= NewBits; break; } case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); APInt InMask = APInt::getLowBitsSet(BitWidth, InBits); - APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits); + APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1); APInt NewBits = ~InMask & NewMask; // If none of the top bits are demanded, convert this into an any_extend. @@ -1090,37 +1108,34 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, InDemandedBits |= InSignBit; InDemandedBits = InDemandedBits.trunc(InBits); - if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, Known, TLO, + Depth+1)) return true; - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + Known = Known.zext(BitWidth); // If the sign bit is known zero, convert this to a zero extend. - if (KnownZero.intersects(InSignBit)) + if (Known.Zero.intersects(InSignBit)) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Op.getOperand(0))); // If the sign bit is known one, the top bits match. 
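// --- Illustrative aside (not part of the patch) ---------------------------
// The sign-extension facts used above, checked exhaustively for an 8-bit
// value widened to 32 bits: if the source sign bit is clear, sign- and
// zero-extension agree (so the node can become a zero extension); if it is
// set, every newly introduced high bit is one.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t NewBits = 0xFFFFFF00u; // bits introduced by the extension
  for (int v = 0; v < 256; ++v) {
    const uint8_t In = static_cast<uint8_t>(v);
    const uint32_t SExt =
        static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(In)));
    const uint32_t ZExt = In;
    if ((In & 0x80) == 0)
      assert(SExt == ZExt);                // sign bit known zero
    else
      assert((SExt & NewBits) == NewBits); // sign bit known one
  }
  return 0;
}
// ---------------------------------------------------------------------------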
- if (KnownOne.intersects(InSignBit)) { - KnownOne |= NewBits; - assert((KnownZero & NewBits) == 0); + if (Known.One.intersects(InSignBit)) { + Known.One |= NewBits; + assert((Known.Zero & NewBits) == 0); } else { // Otherwise, top bits aren't known. - assert((KnownOne & NewBits) == 0); - assert((KnownZero & NewBits) == 0); + assert((Known.One & NewBits) == 0); + assert((Known.Zero & NewBits) == 0); } break; } case ISD::ANY_EXTEND: { unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); APInt InMask = NewMask.trunc(OperandBitWidth); - if (SimplifyDemandedBits(Op.getOperand(0), InMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known = Known.zext(BitWidth); break; } case ISD::TRUNCATE: { @@ -1128,11 +1143,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // zero/one bits live out. unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); APInt TruncMask = NewMask.zext(OperandBitWidth); - if (SimplifyDemandedBits(Op.getOperand(0), TruncMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), TruncMask, Known, TLO, Depth+1)) return true; - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); + Known = Known.trunc(BitWidth); // If the input is only used by this truncate, see if we can shrink it based // on the known demanded bits. @@ -1158,26 +1171,29 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, getShiftAmountTy(Op.getValueType(), DL)); } - APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, - OperandBitWidth - BitWidth); - HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth); - - if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) { - // None of the shifted in bits are needed. Add a truncate of the - // shift input, then shift it. - SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl, - Op.getValueType(), - In.getOperand(0)); - return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, - Op.getValueType(), - NewTrunc, - Shift)); + if (ShAmt->getZExtValue() < BitWidth) { + APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, + OperandBitWidth - BitWidth); + HighBits.lshrInPlace(ShAmt->getZExtValue()); + HighBits = HighBits.trunc(BitWidth); + + if (!(HighBits & NewMask)) { + // None of the shifted in bits are needed. Add a truncate of the + // shift input, then shift it. 
+ SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl, + Op.getValueType(), + In.getOperand(0)); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, + Op.getValueType(), + NewTrunc, + Shift)); + } } break; } } - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case ISD::AssertZext: { @@ -1187,11 +1203,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask, - KnownZero, KnownOne, TLO, Depth+1)) + Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - KnownZero |= ~InMask & NewMask; + Known.Zero |= ~InMask; break; } case ISD::BITCAST: @@ -1200,7 +1216,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (!TLO.LegalOperations() && !Op.getValueType().isVector() && !Op.getOperand(0).getValueType().isVector() && - NewMask == APInt::getSignBit(Op.getValueSizeInBits()) && + NewMask == APInt::getSignMask(Op.getValueSizeInBits()) && Op.getOperand(0).getValueType().isFloatingPoint()) { bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); @@ -1229,22 +1245,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // of the highest bit demanded of them. APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - NewMask.countLeadingZeros()); - if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2, - KnownOne2, TLO, Depth+1) || - SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2, - KnownOne2, TLO, Depth+1) || + if (SimplifyDemandedBits(Op.getOperand(0), LoMask, Known2, TLO, Depth+1) || + SimplifyDemandedBits(Op.getOperand(1), LoMask, Known2, TLO, Depth+1) || // See if the operation should be performed at a smaller bit width. - TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) { - const SDNodeFlags *Flags = Op.getNode()->getFlags(); - if (Flags->hasNoSignedWrap() || Flags->hasNoUnsignedWrap()) { + ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) { + SDNodeFlags Flags = Op.getNode()->getFlags(); + if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { // Disable the nsw and nuw flags. We can no longer guarantee that we // won't wrap after simplification. - SDNodeFlags NewFlags = *Flags; - NewFlags.setNoSignedWrap(false); - NewFlags.setNoUnsignedWrap(false); + Flags.setNoSignedWrap(false); + Flags.setNoUnsignedWrap(false); SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Op.getOperand(0), Op.getOperand(1), - &NewFlags); + Flags); return TLO.CombineTo(Op, NewOp); } return true; @@ -1253,13 +1266,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } default: // Just use computeKnownBits to compute output bits. - TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth); + TLO.DAG.computeKnownBits(Op, Known, Depth); break; } // If we know the value of all of the demanded bits, return this as a // constant. - if ((NewMask & (KnownZero|KnownOne)) == NewMask) { + if (NewMask.isSubsetOf(Known.Zero|Known.One)) { // Avoid folding to a constant if any OpaqueConstant is involved. 
const SDNode *N = Op.getNode(); for (SDNodeIterator I = SDNodeIterator::begin(N), @@ -1270,17 +1283,17 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } return TLO.CombineTo(Op, - TLO.DAG.getConstant(KnownOne, dl, Op.getValueType())); + TLO.DAG.getConstant(Known.One, dl, Op.getValueType())); } return false; } /// Determine which of the bits specified in Mask are known to be either zero or -/// one and return them in the KnownZero/KnownOne bitsets. +/// one and return them in the Known. void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -1289,12 +1302,13 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Op.getOpcode() == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); + Known.resetAll(); } /// This method can be implemented by targets that want to expose additional /// information about sign bits to the DAG Combiner. unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + const APInt &, const SelectionDAG &, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -1306,31 +1320,38 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } +// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must +// work with truncating build vectors and vectors with elements of less than +// 8 bits. bool TargetLowering::isConstTrueVal(const SDNode *N) const { if (!N) return false; - const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N); - if (!CN) { - const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N); - if (!BV) - return false; - - // Only interested in constant splats, we don't care about undef - // elements in identifying boolean constants and getConstantSplatNode - // returns NULL if all ops are undef; - CN = BV->getConstantSplatNode(); + APInt CVal; + if (auto *CN = dyn_cast<ConstantSDNode>(N)) { + CVal = CN->getAPIntValue(); + } else if (auto *BV = dyn_cast<BuildVectorSDNode>(N)) { + auto *CN = BV->getConstantSplatNode(); if (!CN) return false; + + // If this is a truncating build vector, truncate the splat value. + // Otherwise, we may fail to match the expected values below. + unsigned BVEltWidth = BV->getValueType(0).getScalarSizeInBits(); + CVal = CN->getAPIntValue(); + if (BVEltWidth < CVal.getBitWidth()) + CVal = CVal.trunc(BVEltWidth); + } else { + return false; } switch (getBooleanContents(N->getValueType(0))) { case UndefinedBooleanContent: - return CN->getAPIntValue()[0]; + return CVal[0]; case ZeroOrOneBooleanContent: - return CN->isOne(); + return CVal.isOneValue(); case ZeroOrNegativeOneBooleanContent: - return CN->isAllOnesValue(); + return CVal.isAllOnesValue(); } llvm_unreachable("Invalid boolean contents"); @@ -1472,8 +1493,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } - // Ensure that the constant occurs on the RHS, and fold constant - // comparisons. + // Ensure that the constant occurs on the RHS and fold constant comparisons. 
ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond); if (isa<ConstantSDNode>(N0.getNode()) && (DCI.isBeforeLegalizeOps() || @@ -1486,7 +1506,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an // equality comparison, then we're just comparing whether X itself is // zero. - if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) && + if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() || C1.isOneValue()) && N0.getOperand(0).getOpcode() == ISD::CTLZ && N0.getOperand(1).getOpcode() == ISD::Constant) { const APInt &ShAmt @@ -1617,14 +1637,13 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0), TopSetCC.getOperand(1), InvCond); - } } } - // If the LHS is '(and load, const)', the RHS is 0, - // the test is for equality or unsigned, and all 1 bits of the const are - // in the same partial word, see if we can shorten the load. + // If the LHS is '(and load, const)', the RHS is 0, the test is for + // equality or unsigned, and all 1 bits of the const are in the same + // partial word, see if we can shorten the load. if (DCI.isBeforeLegalize() && !ISD::isSignedIntSetCC(Cond) && N0.getOpcode() == ISD::AND && C1 == 0 && @@ -1647,16 +1666,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, for (unsigned width = origWidth / 2; width>=8; width /= 2) { APInt newMask = APInt::getLowBitsSet(maskWidth, width); for (unsigned offset=0; offset<origWidth/width; offset++) { - if ((newMask & Mask) == Mask) { - if (!DAG.getDataLayout().isLittleEndian()) - bestOffset = (origWidth/width - offset - 1) * (width/8); - else + if (Mask.isSubsetOf(newMask)) { + if (DAG.getDataLayout().isLittleEndian()) bestOffset = (uint64_t)offset * (width/8); + else + bestOffset = (origWidth/width - offset - 1) * (width/8); bestMask = Mask.lshr(offset * (width/8) * 8); bestWidth = width; break; } - newMask = newMask << width; + newMask <<= width; } } } @@ -1692,10 +1711,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, switch (Cond) { case ISD::SETUGT: case ISD::SETUGE: - case ISD::SETEQ: return DAG.getConstant(0, dl, VT); + case ISD::SETEQ: + return DAG.getConstant(0, dl, VT); case ISD::SETULT: case ISD::SETULE: - case ISD::SETNE: return DAG.getConstant(1, dl, VT); + case ISD::SETNE: + return DAG.getConstant(1, dl, VT); case ISD::SETGT: case ISD::SETGE: // True if the sign bit of C1 is set. @@ -1764,12 +1785,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ExtSrcTyBits), dl, ExtDstTy), Cond); - } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) && + } else if ((N1C->isNullValue() || N1C->isOne()) && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC if (N0.getOpcode() == ISD::SETCC && isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) { - bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getAPIntValue() != 1); + bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne()); if (TrueWhenTrue) return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); // Invert the condition. 
@@ -1786,7 +1807,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, N0.getOperand(0).getOpcode() == ISD::XOR && N0.getOperand(1) == N0.getOperand(0).getOperand(1))) && isa<ConstantSDNode>(N0.getOperand(1)) && - cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) { + cast<ConstantSDNode>(N0.getOperand(1))->isOne()) { // If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We // can only do this if the top bits are known zero. unsigned BitWidth = N0.getValueSizeInBits(); @@ -1795,9 +1816,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, BitWidth-1))) { // Okay, get the un-inverted input value. SDValue Val; - if (N0.getOpcode() == ISD::XOR) + if (N0.getOpcode() == ISD::XOR) { Val = N0.getOperand(0); - else { + } else { assert(N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::XOR); // ((X^1)&1)^1 -> X & 1 @@ -1809,7 +1830,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, Val, N1, Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ); } - } else if (N1C->getAPIntValue() == 1 && + } else if (N1C->isOne() && (VT == MVT::i1 || getBooleanContents(N0->getValueType(0)) == ZeroOrOneBooleanContent)) { @@ -1827,7 +1848,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (Op0.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op0.getOperand(1)) && - cast<ConstantSDNode>(Op0.getOperand(1))->getAPIntValue() == 1) { + cast<ConstantSDNode>(Op0.getOperand(1))->isOne()) { // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0. if (Op0.getValueType().bitsGT(VT)) Op0 = DAG.getNode(ISD::AND, dl, VT, @@ -1862,7 +1883,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // Canonicalize GE/LE comparisons to use GT/LT comparisons. if (Cond == ISD::SETGE || Cond == ISD::SETUGE) { - if (C1 == MinVal) return DAG.getConstant(1, dl, VT); // X >= MIN --> true + // X >= MIN --> true + if (C1 == MinVal) + return DAG.getConstant(1, dl, VT); + // X >= C0 --> X > (C0 - 1) APInt C = C1 - 1; ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT; @@ -1877,7 +1901,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (Cond == ISD::SETLE || Cond == ISD::SETULE) { - if (C1 == MaxVal) return DAG.getConstant(1, dl, VT); // X <= MAX --> true + // X <= MAX --> true + if (C1 == MaxVal) + return DAG.getConstant(1, dl, VT); + // X <= C0 --> X < (C0 + 1) APInt C = C1 + 1; ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT; @@ -2006,7 +2033,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } else { ShiftBits = C1.countTrailingZeros(); } - NewC = NewC.lshr(ShiftBits); + NewC.lshrInPlace(ShiftBits); if (ShiftBits && NewC.getMinSignedBits() <= 64 && isLegalICmpImmediate(NewC.getSExtValue())) { auto &DL = DAG.getDataLayout(); @@ -2050,6 +2077,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (Cond == ISD::SETO || Cond == ISD::SETUO) return DAG.getSetCC(dl, VT, N0, N0, Cond); + // setcc (fneg x), C -> setcc swap(pred) x, -C + if (N0.getOpcode() == ISD::FNEG) { + ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(SwapCond, N0.getSimpleValueType())) { + SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1); + return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond); + } + } + // If the condition is not legal, see if we can find an equivalent one // which is legal. 
if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) { @@ -2129,7 +2166,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond); if (N0.getOperand(1) == N1.getOperand(1)) return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond); - if (DAG.isCommutativeBinOp(N0.getOpcode())) { + if (isCommutativeBinOp(N0.getOpcode())) { // If X op Y == Y op X, try other combinations. if (N0.getOperand(0) == N1.getOperand(1)) return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0), @@ -2193,7 +2230,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, N0.getOperand(1), DAG.getConstant(0, dl, N0.getValueType()), Cond); if (N0.getOperand(1) == N1) { - if (DAG.isCommutativeBinOp(N0.getOpcode())) + if (isCommutativeBinOp(N0.getOpcode())) return DAG.getSetCC(dl, VT, N0.getOperand(0), DAG.getConstant(0, dl, N0.getValueType()), Cond); @@ -2220,7 +2257,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, N1.getOperand(1), DAG.getConstant(0, dl, N1.getValueType()), Cond); if (N1.getOperand(1) == N0) { - if (DAG.isCommutativeBinOp(N1.getOpcode())) + if (isCommutativeBinOp(N1.getOpcode())) return DAG.getSetCC(dl, VT, N1.getOperand(0), DAG.getConstant(0, dl, N1.getValueType()), Cond); if (N1.getNode()->hasOneUse()) { @@ -2445,7 +2482,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // gcc prints these as sign extended. Sign extend value to 64 bits // now; without this it would get ZExt'd later in // ScheduleDAGSDNodes::EmitNode, which is very generic. - Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(), + Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(C), MVT::i64)); } return; @@ -2470,13 +2507,10 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr)); // Figure out which register class contains this reg. - for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(), - E = RI->regclass_end(); RCI != E; ++RCI) { - const TargetRegisterClass *RC = *RCI; - + for (const TargetRegisterClass *RC : RI->regclasses()) { // If none of the value types for this register class are valid, we // can't use it. For example, 64-bit reg classes on 32-bit targets. - if (!isLegalRC(RC)) + if (!isLegalRC(*RI, *RC)) continue; for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); @@ -2488,9 +2522,9 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, // If this register class has the requested value type, return it, // otherwise keep searching and return the first class found // if no other is found which explicitly has the requested type. - if (RC->hasType(VT)) + if (RI->isTypeLegalForClass(*RC, VT)) return S; - else if (!R.second) + if (!R.second) R = S; } } @@ -2914,9 +2948,9 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, DAG.getDataLayout())); SDNodeFlags Flags; Flags.setExact(true); - Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, &Flags); + Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, Flags); Created.push_back(Op1.getNode()); - d = d.ashr(ShAmt); + d.ashrInPlace(ShAmt); } // Calculate the multiplicative inverse, using Newton's method. 
@@ -2933,7 +2967,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -2958,7 +2992,7 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor, return SDValue(); // If the sdiv has an 'exact' bit we can use a simpler lowering. - if (cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact()) + if (N->getFlags().hasExact()) return BuildExactSDIV(*this, N->getOperand(0), Divisor, dl, DAG, *Created); APInt::ms magics = Divisor.magic(); @@ -3297,7 +3331,7 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT); SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT); SDValue Bias = DAG.getConstant(127, dl, IntVT); - SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()), dl, + SDValue SignMask = DAG.getConstant(APInt::getSignMask(VT.getSizeInBits()), dl, IntVT); SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, dl, IntVT); SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT); @@ -3808,7 +3842,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()); - CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); + CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
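Note on the TargetLowering.cpp hunks above: the SimplifyDemandedBits changes fold the old KnownZero/KnownOne APInt pair into a single Known object carrying both masks, which is why the asserts now read !Known.hasConflict() and the extend/truncate cases become Known = Known.zext(BitWidth) / Known = Known.trunc(BitWidth). As a rough, non-authoritative illustration of the invariant those asserts check, here is a minimal self-contained C++ sketch; the ToyKnownBits type and the example masks are invented for this note and are not LLVM's actual KnownBits header.

// Toy stand-in (an assumption for illustration, not llvm/Support/KnownBits.h)
// for the Known.Zero / Known.One bundle used by the hunks above.
#include <cassert>
#include <cstdint>

struct ToyKnownBits {
  uint64_t Zero = 0; // bits known to be 0
  uint64_t One = 0;  // bits known to be 1
  bool hasConflict() const { return (Zero & One) != 0; } // a bit cannot be both
  void resetAll() { Zero = 0; One = 0; }                 // "nothing is known"
};

int main() {
  // Suppose analysis proved the low nibble of some value is 0b1010.
  ToyKnownBits Known;
  Known.One = 0x0A;
  Known.Zero = 0x05;
  assert(!Known.hasConflict() && "Bits known to be one AND zero?");

  // Analogue of the fold-to-constant test NewMask.isSubsetOf(Known.Zero | Known.One):
  // if every demanded bit is known, the demanded bits' value is just Known.One.
  uint64_t NewMask = 0x0F; // demanded bits
  bool AllDemandedBitsKnown = (NewMask & ~(Known.Zero | Known.One)) == 0;
  assert(AllDemandedBitsKnown);
  return 0;
}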
diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index ff7d205..7b60d22 100644 --- a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -16,9 +16,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -27,7 +27,7 @@ using namespace llvm; -#define DEBUG_TYPE "shadowstackgclowering" +#define DEBUG_TYPE "shadow-stack-gc-lowering" namespace { @@ -66,10 +66,10 @@ private: }; } -INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) INITIALIZE_PASS_DEPENDENCY(GCModuleInfo) -INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_END(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); } diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp index 4837495..aa75f5e 100644 --- a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -210,13 +210,12 @@ public: char ShrinkWrap::ID = 0; char &llvm::ShrinkWrapID = ShrinkWrap::ID; -INITIALIZE_PASS_BEGIN(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, - false) +INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false) +INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const { @@ -282,8 +281,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, if (!Restore) Restore = &MBB; - else + else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it + // means the block never returns. If that's the + // case, we don't want to call + // `findNearestCommonDominator`, which will + // return `Restore`. Restore = MPDT->findNearestCommonDominator(Restore, &MBB); + else + Restore = nullptr; // Abort, we can't find a restore point in this case. // Make sure we would be able to insert the restore code before the // terminator. @@ -293,7 +298,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, continue; // One of the terminator needs to happen before the restore point. if (MBB.succ_empty()) { - Restore = nullptr; + Restore = nullptr; // Abort, we can't find a restore point in this case. break; } // Look for a restore point that post-dominates all the successors. 
@@ -419,7 +424,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF, } bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty() || !isShrinkWrapEnabled(MF)) + if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF)) return false; DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 209bbe5..17a3a84 100644 --- a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -12,11 +12,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -64,6 +64,7 @@ public: private: bool setupEntryBlockAndCallSites(Function &F); + bool undoSwiftErrorSelect(Function &F); void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal); Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads); void lowerIncomingArguments(Function &F); @@ -73,7 +74,7 @@ private: } // end anonymous namespace char SjLjEHPrepare::ID = 0; -INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions", +INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions", false, false) // Public Interface To the SjLjEHPrepare pass. @@ -92,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) { doubleUnderDataTy, // __data VoidPtrTy, // __personality VoidPtrTy, // __lsda - doubleUnderJBufTy, // __jbuf - nullptr); + doubleUnderJBufTy // __jbuf + ); return true; } @@ -124,8 +125,11 @@ static void MarkBlocksLiveIn(BasicBlock *BB, if (!LiveBBs.insert(BB).second) return; // already been here. - for (BasicBlock *PredBB : predecessors(BB)) - MarkBlocksLiveIn(PredBB, LiveBBs); + df_iterator_default_set<BasicBlock*> Visited; + + for (BasicBlock *B : inverse_depth_first_ext(BB, Visited)) + LiveBBs.insert(B); + } /// substituteLPadValues - Substitute the values returned by the landingpad @@ -174,8 +178,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, // because the value needs to be added to the global context list. auto &DL = F.getParent()->getDataLayout(); unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy); - FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", - &EntryBB->front()); + FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), + nullptr, Align, "fn_context", &EntryBB->front()); // Fill in the function context structure. for (LandingPadInst *LPI : LPads) { @@ -458,14 +462,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { return true; } +bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) { + // We have inserted dummy copies 'select true, arg, undef' in the entry block + // for arguments to simplify this pass. + // swifterror arguments cannot be used in this way. Undo the select for the + // swifterror argument. 
+  for (auto &AI : F.args()) {
+    if (AI.isSwiftError()) {
+      assert(AI.hasOneUse() && "Must have converted the argument to a select");
+      auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser());
+      assert(Select && "There must be a single select user");
+      auto *OrigSwiftError = cast<Argument>(Select->getTrueValue());
+      Select->replaceAllUsesWith(OrigSwiftError);
+      Select->eraseFromParent();
+      return true;
+    }
+  }
+  return false;
+}
+
 bool SjLjEHPrepare::runOnFunction(Function &F) {
   Module &M = *F.getParent();
   RegisterFn = M.getOrInsertFunction(
       "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
-      PointerType::getUnqual(FunctionContextTy), nullptr);
+      PointerType::getUnqual(FunctionContextTy));
   UnregisterFn = M.getOrInsertFunction(
       "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
-      PointerType::getUnqual(FunctionContextTy), nullptr);
+      PointerType::getUnqual(FunctionContextTy));
   FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
   StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
   StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
@@ -476,5 +499,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {
   FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
   bool Res = setupEntryBlockAndCallSites(F);
+  if (Res)
+    Res |= undoSwiftErrorSelect(F);
   return Res;
 }
diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
index dba103e9..3656832 100644
--- a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
 #define DEBUG_TYPE "slotindexes"
 char SlotIndexes::ID = 0;
-INITIALIZE_PASS(SlotIndexes, "slotindexes",
+INITIALIZE_PASS(SlotIndexes, DEBUG_TYPE,
   "Slot index numbering", false, false)
 STATISTIC(NumLocalRenum, "Number of local renumberings");
@@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
   return false;
 }
+void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) {
+  assert(!MI.isBundledWithPred() &&
+         "Use removeSingleMachineInstrFromMaps() instead");
+  Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+  if (mi2iItr == mi2iMap.end())
+    return;
+
+  SlotIndex MIIndex = mi2iItr->second;
+  IndexListEntry &MIEntry = *MIIndex.listEntry();
+  assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+  mi2iMap.erase(mi2iItr);
+  // FIXME: Eventually we want to actually delete these indexes.
+  MIEntry.setInstr(nullptr);
+}
+
+void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
+  Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+  if (mi2iItr == mi2iMap.end())
+    return;
+
+  SlotIndex MIIndex = mi2iItr->second;
+  IndexListEntry &MIEntry = *MIIndex.listEntry();
+  assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+  mi2iMap.erase(mi2iItr);
+
+  // When removing the first instruction of a bundle update mapping to next
+  // instruction.
+  if (MI.isBundledWithSucc()) {
+    // Only the first instruction of a bundle should have an index assigned.
+    assert(!MI.isBundledWithPred() && "Should have first bundle instruction");
+
+    MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator());
+    MachineInstr &NextMI = *Next;
+    MIEntry.setInstr(&NextMI);
+    mi2iMap.insert(std::make_pair(&NextMI, MIIndex));
+    return;
+  } else {
+    // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr); + } +} + void SlotIndexes::renumberIndexes() { // Renumber updates the index of every element of the index list. DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n"); diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp index f10c98e..0abe1c4 100644 --- a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp @@ -40,14 +40,14 @@ using namespace llvm; -#define DEBUG_TYPE "spillplacement" +#define DEBUG_TYPE "spill-code-placement" char SpillPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) char &llvm::SpillPlacementID = SpillPlacement::ID; @@ -310,7 +310,7 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) { bool SpillPlacement::scanActiveBundles() { RecentPositive.clear(); - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) { + for (unsigned n : ActiveNodes->set_bits()) { update(n); // A node that must spill, or a node without any links is not going to // change its value ever again, so exclude it from iterations. @@ -365,7 +365,7 @@ SpillPlacement::finish() { // Write preferences back to ActiveNodes. bool Perfect = true; - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) + for (unsigned n : ActiveNodes->set_bits()) if (!nodes[n].preferReg()) { ActiveNodes->reset(n); Perfect = false; diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp index 1c6a84e..323045f 100644 --- a/contrib/llvm/lib/CodeGen/SplitKit.cpp +++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -52,10 +53,10 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI, std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num]; SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB); - SmallVector<const MachineBasicBlock *, 1> EHPadSucessors; + SmallVector<const MachineBasicBlock *, 1> EHPadSuccessors; for (const MachineBasicBlock *SMBB : MBB.successors()) if (SMBB->isEHPad()) - EHPadSucessors.push_back(SMBB); + EHPadSuccessors.push_back(SMBB); // Compute insert points on the first call. The pair is independent of the // current live interval. @@ -67,7 +68,7 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI, LIP.first = LIS.getInstructionIndex(*FirstTerm); // If there is a landing pad successor, also find the call instruction. - if (EHPadSucessors.empty()) + if (EHPadSuccessors.empty()) return LIP.first; // There may not be a call instruction (?) in which case we ignore LPad. 
LIP.second = LIP.first; @@ -86,7 +87,7 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI, if (!LIP.second) return LIP.first; - if (none_of(EHPadSucessors, [&](const MachineBasicBlock *EHPad) { + if (none_of(EHPadSuccessors, [&](const MachineBasicBlock *EHPad) { return LIS.isLiveInToMBB(CurLI, EHPad); })) return LIP.first; @@ -487,12 +488,125 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) { VFP = ValueForcePair(nullptr, true); } +SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + bool FirstCopy = !Def.isValid(); + MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) + .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) + | getInternalReadRegState(!FirstCopy), SubIdx) + .addReg(FromReg, 0, SubIdx); + + BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); + if (FirstCopy) { + SlotIndexes &Indexes = *LIS.getSlotIndexes(); + Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); + } else { + CopyMI->bundleWithPred(); + } + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx); + DestLI.refineSubRanges(Allocator, LaneMask, + [Def, &Allocator](LiveInterval::SubRange& SR) { + SR.createDeadDef(Def, Allocator); + }); + return Def; +} + +SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg, + LaneBitmask LaneMask, MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { + // The full vreg is copied. + MachineInstr *CopyMI = + BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg); + SlotIndexes &Indexes = *LIS.getSlotIndexes(); + return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); + } + + // Only a subset of lanes needs to be copied. The following is a simple + // heuristic to construct a sequence of COPYs. We could add a target + // specific callback if this turns out to be suboptimal. + LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx)); + + // First pass: Try to find a perfectly matching subregister index. If none + // exists find the one covering the most lanemask bits. + SmallVector<unsigned, 8> PossibleIndexes; + unsigned BestIdx = 0; + unsigned BestCover = 0; + const TargetRegisterClass *RC = MRI.getRegClass(FromReg); + assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class"); + for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI.getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LaneMask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside \p LaneMask. + if ((SubRegMask & ~LaneMask).any()) + continue; + + unsigned PopCount = countPopulation(SubRegMask.getAsInteger()); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. 
+ if (BestIdx == 0) + report_fatal_error("Impossible to implement partial COPY"); + + SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, + BestIdx, DestLI, Late, SlotIndex()); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + LaneBitmask LanesLeft = LaneMask & ~(TRI.getSubRegIndexLaneMask(BestIdx)); + while (LanesLeft.any()) { + unsigned BestIdx = 0; + int BestCover = INT_MIN; + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LanesLeft) { + BestIdx = Idx; + break; + } + + // Try to cover as much of the remaining lanes as possible but + // as few of the already covered lanes as possible. + int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger()) + - countPopulation((SubRegMask & ~LanesLeft).getAsInteger()); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) + report_fatal_error("Impossible to implement partial COPY"); + + buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, + DestLI, Late, Def); + LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx); + } + + return Def; +} + VNInfo *SplitEditor::defFromParent(unsigned RegIdx, VNInfo *ParentVNI, SlotIndex UseIdx, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineInstr *CopyMI = nullptr; SlotIndex Def; LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); @@ -505,24 +619,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, LiveInterval &OrigLI = LIS.getInterval(Original); VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); + unsigned Reg = LI->reg; bool DidRemat = false; if (OrigVNI) { LiveRangeEdit::Remat RM(ParentVNI); RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def); if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) { - Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late); + Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late); ++NumRemats; DidRemat = true; } } if (!DidRemat) { - // Can't remat, just insert a copy from parent. - CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg) - .addReg(Edit->getReg()); - Def = LIS.getSlotIndexes() - ->insertMachineInstrInMaps(*CopyMI, Late) - .getRegSlot(); + LaneBitmask LaneMask; + if (LI->hasSubRanges()) { + LaneMask = LaneBitmask::getNone(); + for (LiveInterval::SubRange &S : LI->subranges()) + LaneMask |= S.LaneMask; + } else { + LaneMask = LaneBitmask::getAll(); + } + ++NumCopies; + Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx); } // Define the value in Reg. diff --git a/contrib/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm/lib/CodeGen/SplitKit.h index a75738a..9d409e9 100644 --- a/contrib/llvm/lib/CodeGen/SplitKit.h +++ b/contrib/llvm/lib/CodeGen/SplitKit.h @@ -405,6 +405,17 @@ private: /// deleteRematVictims - Delete defs that are dead after rematerializing. void deleteRematVictims(); + /// Add a copy instruction copying \p FromReg to \p ToReg before + /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it + /// necessary to construct a sequence of copies to cover it exactly. 
+ SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + bool Late, unsigned RegIdx); + + SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, + MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy); + public: /// Create a new SplitEditor for editing the LiveInterval analyzed by SA. /// Newly created intervals will be appended to newIntervals. diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp index 89c4b57..e5fc540 100644 --- a/contrib/llvm/lib/CodeGen/StackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp @@ -23,7 +23,6 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -38,6 +37,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/WinEHFuncInfo.h" @@ -54,7 +54,7 @@ using namespace llvm; -#define DEBUG_TYPE "stackcoloring" +#define DEBUG_TYPE "stack-coloring" static cl::opt<bool> DisableColoring("no-stack-coloring", @@ -87,10 +87,134 @@ STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots."); STATISTIC(StackSlotMerged, "Number of stack slot merged."); STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region"); +//===----------------------------------------------------------------------===// +// StackColoring Pass +//===----------------------------------------------------------------------===// +// +// Stack Coloring reduces stack usage by merging stack slots when they +// can't be used together. For example, consider the following C program: +// +// void bar(char *, int); +// void foo(bool var) { +// A: { +// char z[4096]; +// bar(z, 0); +// } +// +// char *p; +// char x[4096]; +// char y[4096]; +// if (var) { +// p = x; +// } else { +// bar(y, 1); +// p = y + 1024; +// } +// B: +// bar(p, 2); +// } +// +// Naively-compiled, this program would use 12k of stack space. However, the +// stack slot corresponding to `z` is always destroyed before either of the +// stack slots for `x` or `y` are used, and then `x` is only used if `var` +// is true, while `y` is only used if `var` is false. So in no time are 2 +// of the stack slots used together, and therefore we can merge them, +// compiling the function using only a single 4k alloca: +// +// void foo(bool var) { // equivalent +// char x[4096]; +// char *p; +// bar(x, 0); +// if (var) { +// p = x; +// } else { +// bar(x, 1); +// p = x + 1024; +// } +// bar(p, 2); +// } +// +// This is an important optimization if we want stack space to be under +// control in large functions, both open-coded ones and ones created by +// inlining. // // Implementation Notes: // --------------------- // +// An important part of the above reasoning is that `z` can't be accessed +// while the latter 2 calls to `bar` are running. This is justified because +// `z`'s lifetime is over after we exit from block `A:`, so any further +// accesses to it would be UB. The way we represent this information +// in LLVM is by having frontends delimit blocks with `lifetime.start` +// and `lifetime.end` intrinsics. 
+// +// The effect of these intrinsics seems to be as follows (maybe I should +// specify this in the reference?): +// +// L1) at start, each stack-slot is marked as *out-of-scope*, unless no +// lifetime intrinsic refers to that stack slot, in which case +// it is marked as *in-scope*. +// L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and +// the stack slot is overwritten with `undef`. +// L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*. +// L4) on function exit, all stack slots are marked as *out-of-scope*. +// L5) `lifetime.end` is a no-op when called on a slot that is already +// *out-of-scope*. +// L6) memory accesses to *out-of-scope* stack slots are UB. +// L7) when a stack-slot is marked as *out-of-scope*, all pointers to it +// are invalidated, unless the slot is "degenerate". This is used to +// justify not marking slots as in-use until the pointer to them is +// used, but feels a bit hacky in the presence of things like LICM. See +// the "Degenerate Slots" section for more details. +// +// Now, let's ground stack coloring on these rules. We'll define a slot +// as *in-use* at a (dynamic) point in execution if it either can be +// written to at that point, or if it has a live and non-undef content +// at that point. +// +// Obviously, slots that are never *in-use* together can be merged, and +// in our example `foo`, the slots for `x`, `y` and `z` are never +// in-use together (of course, sometimes slots that *are* in-use together +// might still be mergable, but we don't care about that here). +// +// In this implementation, we successively merge pairs of slots that are +// not *in-use* together. We could be smarter - for example, we could merge +// a single large slot with 2 small slots, or we could construct the +// interference graph and run a "smart" graph coloring algorithm, but with +// that aside, how do we find out whether a pair of slots might be *in-use* +// together? +// +// From our rules, we see that *out-of-scope* slots are never *in-use*, +// and from (L7) we see that "non-degenerate" slots remain non-*in-use* +// until their address is taken. Therefore, we can approximate slot activity +// using dataflow. +// +// A subtle point: naively, we might try to figure out which pairs of +// stack-slots interfere by propagating `S in-use` through the CFG for every +// stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in +// which they are both *in-use*. +// +// That is sound, but overly conservative in some cases: in our (artificial) +// example `foo`, either `x` or `y` might be in use at the label `B:`, but +// as `x` is only in use if we came in from the `var` edge and `y` only +// if we came from the `!var` edge, they still can't be in use together. +// See PR32488 for an important real-life case. +// +// If we wanted to find all points of interference precisely, we could +// propagate `S in-use` and `S&T in-use` predicates through the CFG. That +// would be precise, but requires propagating `O(n^2)` dataflow facts. +// +// However, we aren't interested in the *set* of points of interference +// between 2 stack slots, only *whether* there *is* such a point. So we +// can rely on a little trick: for `S` and `T` to be in-use together, +// one of them needs to become in-use while the other is in-use (or +// they might both become in use simultaneously). We can check this +// by also keeping track of the points at which a stack slot might *start* +// being in-use. 
+// +// Exact first use: +// ---------------- +// // Consider the following motivating example: // // int foo() { @@ -159,6 +283,9 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region"); // lifetime, we can additionally overlap b1 and b5, giving us a 3*1024 // byte stack (better). // +// Degenerate Slots: +// ----------------- +// // Relying entirely on first-use of stack slots is problematic, // however, due to the fact that optimizations can sometimes migrate // uses of a variable outside of its lifetime start/end region. Here @@ -238,10 +365,6 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region"); // for "b" then it will appear that 'b' has a degenerate lifetime. // -//===----------------------------------------------------------------------===// -// StackColoring Pass -//===----------------------------------------------------------------------===// - namespace { /// StackColoring - A machine pass for merging disjoint stack allocations, /// marked by the LIFETIME_START and LIFETIME_END pseudo instructions. @@ -272,8 +395,11 @@ class StackColoring : public MachineFunctionPass { /// Maps basic blocks to a serial number. SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering; - /// Maps liveness intervals for each slot. + /// Maps slots to their use interval. Outside of this interval, slots + /// values are either dead or `undef` and they will not be written to. SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals; + /// Maps slots to the points where they can become in-use. + SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts; /// VNInfo is used for the construction of LiveIntervals. VNInfo::Allocator VNInfoAllocator; /// SlotIndex analysis object. @@ -372,12 +498,12 @@ private: char StackColoring::ID = 0; char &llvm::StackColoringID = StackColoring::ID; -INITIALIZE_PASS_BEGIN(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<SlotIndexes>(); @@ -385,14 +511,13 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -#ifndef NDEBUG - +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag, const BitVector &BV) const { - DEBUG(dbgs() << tag << " : { "); + dbgs() << tag << " : { "; for (unsigned I = 0, E = BV.size(); I != E; ++I) - DEBUG(dbgs() << BV.test(I) << " "); - DEBUG(dbgs() << "}\n"); + dbgs() << BV.test(I) << " "; + dbgs() << "}\n"; } LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { @@ -408,20 +533,19 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { LLVM_DUMP_METHOD void StackColoring::dump() const { for (MachineBasicBlock *MBB : depth_first(MF)) { - DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " [" - << MBB->getName() << "]\n"); - DEBUG(dumpBB(MBB)); + dbgs() << "Inspecting block #" << MBB->getNumber() << " [" + << MBB->getName() << "]\n"; + dumpBB(MBB); } } LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const { for (unsigned I = 0, E = Intervals.size(); I 
!= E; ++I) { - DEBUG(dbgs() << "Interval[" << I << "]:\n"); - DEBUG(Intervals[I]->dump()); + dbgs() << "Interval[" << I << "]:\n"; + Intervals[I]->dump(); } } - -#endif // not NDEBUG +#endif static inline int getStartOrEndSlot(const MachineInstr &MI) { @@ -570,9 +694,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) // Step 2: compute begin/end sets for each block - // NOTE: We use a reverse-post-order iteration to ensure that we obtain a - // deterministic numbering, and because we'll need a post-order iteration - // later for solving the liveness dataflow problem. + // NOTE: We use a depth-first iteration to ensure that we obtain a + // deterministic numbering. for (MachineBasicBlock *MBB : depth_first(MF)) { // Assign a serial number to this basic block. @@ -676,15 +799,22 @@ void StackColoring::calculateLocalLiveness() void StackColoring::calculateLiveIntervals(unsigned NumSlots) { SmallVector<SlotIndex, 16> Starts; - SmallVector<SlotIndex, 16> Finishes; + SmallVector<bool, 16> DefinitelyInUse; // For each block, find which slots are active within this block // and update the live intervals. for (const MachineBasicBlock &MBB : *MF) { Starts.clear(); Starts.resize(NumSlots); - Finishes.clear(); - Finishes.resize(NumSlots); + DefinitelyInUse.clear(); + DefinitelyInUse.resize(NumSlots); + + // Start the interval of the slots that we previously found to be 'in-use'. + BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB]; + for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1; + pos = MBBLiveness.LiveIn.find_next(pos)) { + Starts[pos] = Indexes->getMBBStartIdx(&MBB); + } // Create the interval for the basic blocks containing lifetime begin/end. for (const MachineInstr &MI : MBB) { @@ -696,68 +826,35 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) { SlotIndex ThisIndex = Indexes->getInstructionIndex(MI); for (auto Slot : slots) { if (IsStart) { - if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex) + // If a slot is already definitely in use, we don't have to emit + // a new start marker because there is already a pre-existing + // one. + if (!DefinitelyInUse[Slot]) { + LiveStarts[Slot].push_back(ThisIndex); + DefinitelyInUse[Slot] = true; + } + if (!Starts[Slot].isValid()) Starts[Slot] = ThisIndex; } else { - if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex) - Finishes[Slot] = ThisIndex; + if (Starts[Slot].isValid()) { + VNInfo *VNI = Intervals[Slot]->getValNumInfo(0); + Intervals[Slot]->addSegment( + LiveInterval::Segment(Starts[Slot], ThisIndex, VNI)); + Starts[Slot] = SlotIndex(); // Invalidate the start index + DefinitelyInUse[Slot] = false; + } } } } - // Create the interval of the blocks that we previously found to be 'alive'. - BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB]; - for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1; - pos = MBBLiveness.LiveIn.find_next(pos)) { - Starts[pos] = Indexes->getMBBStartIdx(&MBB); - } - for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1; - pos = MBBLiveness.LiveOut.find_next(pos)) { - Finishes[pos] = Indexes->getMBBEndIdx(&MBB); - } - + // Finish up started segments for (unsigned i = 0; i < NumSlots; ++i) { - // - // When LifetimeStartOnFirstUse is turned on, data flow analysis - // is forward (from starts to ends), not bidirectional. A - // consequence of this is that we can wind up in situations - // where Starts[i] is invalid but Finishes[i] is valid and vice - // versa. Example: - // - // LIFETIME_START x - // if (...) 
{ - // <use of x> - // throw ...; - // } - // LIFETIME_END x - // return 2; - // - // - // Here the slot for "x" will not be live into the block - // containing the "return 2" (since lifetimes start with first - // use, not at the dominating LIFETIME_START marker). - // - if (Starts[i].isValid() && !Finishes[i].isValid()) { - Finishes[i] = Indexes->getMBBEndIdx(&MBB); - } if (!Starts[i].isValid()) continue; - assert(Starts[i] && Finishes[i] && "Invalid interval"); - VNInfo *ValNum = Intervals[i]->getValNumInfo(0); - SlotIndex S = Starts[i]; - SlotIndex F = Finishes[i]; - if (S < F) { - // We have a single consecutive region. - Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum)); - } else { - // We have two non-consecutive regions. This happens when - // LIFETIME_START appears after the LIFETIME_END marker. - SlotIndex NewStart = Indexes->getMBBStartIdx(&MBB); - SlotIndex NewFin = Indexes->getMBBEndIdx(&MBB); - Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum)); - Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum)); - } + SlotIndex EndIdx = Indexes->getMBBEndIdx(&MBB); + VNInfo *VNI = Intervals[i]->getValNumInfo(0); + Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI)); } } } @@ -793,6 +890,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { // Keep a list of *allocas* which need to be remapped. DenseMap<const AllocaInst*, const AllocaInst*> Allocas; + + // Keep a list of allocas which has been affected by the remap. + SmallPtrSet<const AllocaInst*, 32> MergedAllocas; + for (const std::pair<int, int> &SI : SlotRemap) { const AllocaInst *From = MFI->getObjectAllocation(SI.first); const AllocaInst *To = MFI->getObjectAllocation(SI.second); @@ -812,6 +913,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { Inst = Cast; } + // We keep both slots to maintain AliasAnalysis metadata later. + MergedAllocas.insert(From); + MergedAllocas.insert(To); + // Allow the stack protector to adjust its value map to account for the // upcoming replacement. SP->adjustForColoring(From, To); @@ -843,13 +948,6 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { // Update the MachineMemOperand to use the new alloca. for (MachineMemOperand *MMO : I.memoperands()) { - // FIXME: In order to enable the use of TBAA when using AA in CodeGen, - // we'll also need to update the TBAA nodes in MMOs with values - // derived from the merged allocas. When doing this, we'll need to use - // the same variant of GetUnderlyingObjects that is used by the - // instruction scheduler (that can look through ptrtoint/inttoptr - // pairs). - // We've replaced IR-level uses of the remapped allocas, so we only // need to replace direct uses here. const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue()); @@ -901,6 +999,48 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { MO.setIndex(ToSlot); FixedInstr++; } + + // We adjust AliasAnalysis information for merged stack slots. + MachineSDNode::mmo_iterator NewMemOps = + MF->allocateMemRefsArray(I.getNumMemOperands()); + unsigned MemOpIdx = 0; + bool ReplaceMemOps = false; + for (MachineMemOperand *MMO : I.memoperands()) { + // If this memory location can be a slot remapped here, + // we remove AA information. 
+      bool MayHaveConflictingAAMD = false;
+      if (MMO->getAAInfo()) {
+        if (const Value *MMOV = MMO->getValue()) {
+          SmallVector<Value *, 4> Objs;
+          getUnderlyingObjectsForCodeGen(MMOV, Objs, MF->getDataLayout());
+
+          if (Objs.empty())
+            MayHaveConflictingAAMD = true;
+          else
+            for (Value *V : Objs) {
+              // If this memory location comes from a known stack slot
+              // that is not remapped, we continue checking.
+              // Otherwise, we need to invalidate AA information.
+              const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
+              if (AI && MergedAllocas.count(AI)) {
+                MayHaveConflictingAAMD = true;
+                break;
+              }
+            }
+        }
+      }
+      if (MayHaveConflictingAAMD) {
+        NewMemOps[MemOpIdx++] = MF->getMachineMemOperand(MMO, AAMDNodes());
+        ReplaceMemOps = true;
+      }
+      else
+        NewMemOps[MemOpIdx++] = MMO;
+    }
+
+    // If any memory operand is updated, set memory references of
+    // this instruction.
+    if (ReplaceMemOps)
+      I.setMemRefs(std::make_pair(NewMemOps, I.getNumMemOperands()));
   }
   // Update the location of C++ catch objects for the MSVC personality routine.
@@ -987,6 +1127,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
   BasicBlockNumbering.clear();
   Markers.clear();
   Intervals.clear();
+  LiveStarts.clear();
   VNInfoAllocator.Reset();
   unsigned NumSlots = MFI->getObjectIndexEnd();
@@ -998,6 +1139,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
   SmallVector<int, 8> SortedSlots;
   SortedSlots.reserve(NumSlots);
   Intervals.reserve(NumSlots);
+  LiveStarts.resize(NumSlots);
   unsigned NumMarkers = collectMarkers(NumSlots);
@@ -1069,6 +1211,9 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
     return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
   });
+  for (auto &s : LiveStarts)
+    std::sort(s.begin(), s.end());
+
   bool Changed = true;
   while (Changed) {
     Changed = false;
@@ -1084,12 +1229,22 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
       int SecondSlot = SortedSlots[J];
       LiveInterval *First = &*Intervals[FirstSlot];
       LiveInterval *Second = &*Intervals[SecondSlot];
+      auto &FirstS = LiveStarts[FirstSlot];
+      auto &SecondS = LiveStarts[SecondSlot];
       assert (!First->empty() && !Second->empty() && "Found an empty range");
-      // Merge disjoint slots.
-      if (!First->overlaps(*Second)) {
+      // Merge disjoint slots. This is a little bit tricky - see the
+      // Implementation Notes section for an explanation.
+ if (!First->isLiveAtIndexes(SecondS) && + !Second->isLiveAtIndexes(FirstS)) { Changed = true; First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0)); + + int OldSize = FirstS.size(); + FirstS.append(SecondS.begin(), SecondS.end()); + auto Mid = FirstS.begin() + OldSize; + std::inplace_merge(FirstS.begin(), Mid, FirstS.end()); + SlotRemap[SecondSlot] = FirstSlot; SortedSlots[J] = -1; DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<< diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp index 9b7dd400..b4fa29d 100644 --- a/contrib/llvm/lib/CodeGen/StackMaps.cpp +++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- StackMaps.cpp ---------------------------===// +//===- StackMaps.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,30 +8,41 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/StackMaps.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> #include <iterator> +#include <utility> using namespace llvm; #define DEBUG_TYPE "stackmaps" static cl::opt<int> StackMapVersion( - "stackmap-version", cl::init(2), - cl::desc("Specify the stackmap encoding version (default = 2)")); + "stackmap-version", cl::init(3), + cl::desc("Specify the stackmap encoding version (default = 3)")); const char *StackMaps::WSMP = "Stack Maps: "; @@ -74,7 +85,7 @@ unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const { } StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) { - if (StackMapVersion != 2) + if (StackMapVersion != 3) llvm_unreachable("Unsupported stackmap version!"); } @@ -150,7 +161,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, if (SubRegIdx) Offset = TRI->getSubRegIdxOffset(SubRegIdx); - Locs.emplace_back(Location::Register, RC->getSize(), DwarfRegNum, Offset); + Locs.emplace_back(Location::Register, TRI->getSpillSize(*RC), + DwarfRegNum, Offset); return ++MOI; } @@ -209,8 +221,9 @@ void StackMaps::print(raw_ostream &OS) { OS << "Constant Index " << Loc.Offset; break; } - OS << "\t[encoding: .byte " << Loc.Type << ", .byte " << Loc.Size - << ", .short " << Loc.Reg << ", .int " << Loc.Offset << "]\n"; + OS << "\t[encoding: .byte " << Loc.Type << ", .byte 0" + << ", .short " << Loc.Size << ", .short " << Loc.Reg << ", .short 0" + << ", .int " << Loc.Offset << "]\n"; Idx++; } @@ -234,7 +247,7 @@ void StackMaps::print(raw_ostream &OS) { StackMaps::LiveOutReg StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const { unsigned DwarfRegNum = 
getDwarfRegNum(Reg, TRI); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + unsigned Size = TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg)); return LiveOutReg(Reg, DwarfRegNum, Size); } @@ -276,7 +289,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const { } LiveOuts.erase( - remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }), + llvm::remove_if(LiveOuts, + [](const LiveOutReg &LO) { return LO.Reg == 0; }), LiveOuts.end()); return LiveOuts; @@ -286,7 +300,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID, MachineInstr::const_mop_iterator MOI, MachineInstr::const_mop_iterator MOE, bool recordResult) { - MCContext &OutContext = AP.OutStreamer->getContext(); MCSymbol *MILabel = OutContext.createTempSymbol(); AP.OutStreamer->EmitLabel(MILabel); @@ -378,6 +391,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) { } #endif } + void StackMaps::recordStatepoint(const MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint"); @@ -508,11 +522,16 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) { for (const auto &Loc : CSLocs) { OS.EmitIntValue(Loc.Type, 1); - OS.EmitIntValue(Loc.Size, 1); + OS.EmitIntValue(0, 1); // Reserved + OS.EmitIntValue(Loc.Size, 2); OS.EmitIntValue(Loc.Reg, 2); + OS.EmitIntValue(0, 2); // Reserved OS.EmitIntValue(Loc.Offset, 4); } + // Emit alignment to 8 byte. + OS.EmitValueToAlignment(8); + // Num live-out registers and padding to align to 4 byte. OS.EmitIntValue(0, 2); OS.EmitIntValue(LiveOuts.size(), 2); diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp index c2c010a..d8e7840 100644 --- a/contrib/llvm/lib/CodeGen/StackProtector.cpp +++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp @@ -1,4 +1,4 @@ -//===-- StackProtector.cpp - Stack Protector Insertion --------------------===// +//===- StackProtector.cpp - Stack Protector Insertion ---------------------===// // // The LLVM Compiler Infrastructure // @@ -14,30 +14,40 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackProtector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <cstdlib> +#include <utility> + using namespace llvm; #define DEBUG_TYPE 
"stack-protector" @@ -50,12 +60,14 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp", cl::init(true), cl::Hidden); char StackProtector::ID = 0; -INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors", - false, true) -FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) { - return new StackProtector(TM); -} +INITIALIZE_PASS_BEGIN(StackProtector, DEBUG_TYPE, + "Insert stack protectors", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE, + "Insert stack protectors", false, true) + +FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); } StackProtector::SSPLayoutKind StackProtector::getSSPLayout(const AllocaInst *AI) const { @@ -83,12 +95,19 @@ void StackProtector::adjustForColoring(const AllocaInst *From, } } +void StackProtector::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.addPreserved<DominatorTreeWrapperPass>(); +} + bool StackProtector::runOnFunction(Function &Fn) { F = &Fn; M = F->getParent(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + Trip = TM->getTargetTriple(); TLI = TM->getSubtargetImpl(Fn)->getTargetLowering(); HasPrologue = false; HasIRCheck = false; @@ -222,7 +241,16 @@ bool StackProtector::RequiresStackProtector() { if (F->hasFnAttribute(Attribute::SafeStack)) return false; + // We are constructing the OptimizationRemarkEmitter on the fly rather than + // using the analysis pass to avoid building DominatorTree and LoopInfo which + // are not available this late in the IR pipeline. + OptimizationRemarkEmitter ORE(F); + if (F->hasFnAttribute(Attribute::StackProtectReq)) { + ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a function attribute or command-line switch"); NeedsProtector = true; Strong = true; // Use the same heuristic as strong to determine SSPLayout } else if (F->hasFnAttribute(Attribute::StackProtectStrong)) @@ -236,20 +264,29 @@ bool StackProtector::RequiresStackProtector() { for (const Instruction &I : BB) { if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { if (AI->isArrayAllocation()) { + OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray", + &I); + Remark + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a call to alloca or use of a variable length array"; if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) { // A call to alloca with size >= SSPBufferSize requires // stack protectors. Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); + ORE.emit(Remark); NeedsProtector = true; } else if (Strong) { // Require protectors for all alloca calls in strong mode. Layout.insert(std::make_pair(AI, SSPLK_SmallArray)); + ORE.emit(Remark); NeedsProtector = true; } } else { // A call to alloca with a variable size requires protectors. Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); + ORE.emit(Remark); NeedsProtector = true; } continue; @@ -259,6 +296,11 @@ bool StackProtector::RequiresStackProtector() { if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) { Layout.insert(std::make_pair(AI, IsLarge ? 
SSPLK_LargeArray : SSPLK_SmallArray)); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a stack allocated buffer or struct containing a " + "buffer"); NeedsProtector = true; continue; } @@ -266,6 +308,11 @@ bool StackProtector::RequiresStackProtector() { if (Strong && HasAddressTaken(AI)) { ++NumAddrTaken; Layout.insert(std::make_pair(AI, SSPLK_AddrOf)); + ORE.emit( + OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to the address of a local variable being taken"); NeedsProtector = true; } } @@ -448,13 +495,13 @@ BasicBlock *StackProtector::CreateFailBB() { Constant *StackChkFail = M->getOrInsertFunction("__stack_smash_handler", Type::getVoidTy(Context), - Type::getInt8PtrTy(Context), nullptr); + Type::getInt8PtrTy(Context)); B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH")); } else { Constant *StackChkFail = - M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context), - nullptr); + M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); + B.CreateCall(StackChkFail, {}); } B.CreateUnreachable(); diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp index 234b204..856bca1 100644 --- a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -22,6 +21,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" @@ -32,7 +32,7 @@ #include <vector> using namespace llvm; -#define DEBUG_TYPE "stackslotcoloring" +#define DEBUG_TYPE "stack-slot-coloring" static cl::opt<bool> DisableSharing("no-stack-slot-sharing", @@ -116,12 +116,12 @@ namespace { char StackSlotColoring::ID = 0; char &llvm::StackSlotColoringID = StackSlotColoring::ID; -INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_BEGIN(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_END(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) namespace { diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp index e2377d8..489a607 100644 --- a/contrib/llvm/lib/CodeGen/TailDuplication.cpp +++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp @@ -1,4 +1,4 @@ -//===-- TailDuplication.cpp - Duplicate blocks into predecessors' tails ---===// +//===- TailDuplication.cpp - Duplicate blocks into predecessors' tails ----===// // // The LLVM Compiler Infrastructure // @@ -12,22 +12,25 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/Passes.h" 
#include "llvm/CodeGen/TailDuplicator.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/Debug.h" +#include "llvm/Pass.h" + using namespace llvm; #define DEBUG_TYPE "tailduplication" namespace { + /// Perform tail duplication. Delegates to TailDuplicator class TailDuplicatePass : public MachineFunctionPass { TailDuplicator Duplicator; public: static char ID; + explicit TailDuplicatePass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -35,13 +38,13 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; }; +} // end anonymous namespace + char TailDuplicatePass::ID = 0; -} char &llvm::TailDuplicateID = TailDuplicatePass::ID; -INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false, - false) +INITIALIZE_PASS(TailDuplicatePass, DEBUG_TYPE, "Tail Duplication", false, false) bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) diff --git a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp index 7709236..dc7265d 100644 --- a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp @@ -1,4 +1,4 @@ -//===-- TailDuplicator.cpp - Duplicate blocks into predecessors' tails ---===// +//===- TailDuplicator.cpp - Duplicate blocks into predecessors' tails -----===// // // The LLVM Compiler Infrastructure // @@ -12,22 +12,36 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/TailDuplicator.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/TailDuplicator.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "tailduplication" @@ -41,15 +55,13 @@ STATISTIC(NumTailDupRemoved, STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumAddedPHIs, "Number of phis added"); -namespace llvm { - // Heuristic for tail duplication. 
static cl::opt<unsigned> TailDuplicateSize( "tail-dup-size", cl::desc("Maximum instructions to consider tail duplicating"), cl::init(2), cl::Hidden); -cl::opt<unsigned> TailDupIndirectBranchSize( +static cl::opt<unsigned> TailDupIndirectBranchSize( "tail-dup-indirect-size", cl::desc("Maximum instructions to consider tail duplicating blocks that " "end with indirect branches."), cl::init(20), @@ -138,7 +150,7 @@ bool TailDuplicator::tailDuplicateAndUpdate( bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds, - llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) { + function_ref<void(MachineBasicBlock *)> *RemovalCallback) { // Save the successors list. SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(), MBB->succ_end()); @@ -725,6 +737,7 @@ bool TailDuplicator::duplicateSimpleBB( if (PredTBB == NextBB && PredFBB == nullptr) PredTBB = nullptr; + auto DL = PredBB->findBranchDebugLoc(); TII->removeBranch(*PredBB); if (!PredBB->isSuccessor(NewTarget)) @@ -735,7 +748,7 @@ bool TailDuplicator::duplicateSimpleBB( } if (PredTBB) - TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); + TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL); TDBBs.push_back(PredBB); } @@ -748,7 +761,7 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, if (PredBB->succ_size() > 1) return false; - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector<MachineOperand, 4> PredCond; if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond)) return false; @@ -831,7 +844,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, appendCopies(PredBB, CopyInfos, Copies); // Simplify - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector<MachineOperand, 4> PredCond; TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond); @@ -970,7 +983,7 @@ void TailDuplicator::appendCopies(MachineBasicBlock *MBB, /// the CFG. void TailDuplicator::removeDeadBlock( MachineBasicBlock *MBB, - llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) { + function_ref<void(MachineBasicBlock *)> *RemovalCallback) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); @@ -984,5 +997,3 @@ void TailDuplicator::removeDeadBlock( // Remove the block. 
MBB->eraseFromParent(); } - -} // End llvm namespace diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index f082add..9dd98b4 100644 --- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -1,4 +1,4 @@ -//===----- TargetFrameLoweringImpl.cpp - Implement target frame interface --==// +//===- TargetFrameLoweringImpl.cpp - Implement target frame interface ------==// // // The LLVM Compiler Infrastructure // @@ -14,19 +14,21 @@ #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <cstdlib> + using namespace llvm; -TargetFrameLowering::~TargetFrameLowering() { -} +TargetFrameLowering::~TargetFrameLowering() = default; /// The default implementation just looks at attribute "no-frame-pointer-elim". bool TargetFrameLowering::noFramePointerElim(const MachineFunction &MF) const { @@ -73,7 +75,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, return; // Get the callee saved register list... - const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); // Early exit if there are no callee saved registers. if (!CSRegs || CSRegs[0] == 0) diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp index 01f91b9..14c5adc 100644 --- a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -345,12 +345,12 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = RC->getSize(); + Size = TRI->getSpillSize(*RC); Offset = 0; return true; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); // Convert bit size to byte size to be consistent with // MCRegisterClass::getSize(). 
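The two getStackSlotRange hunks on either side of this point switch the spill size query from TargetRegisterClass::getSize() to TargetRegisterInfo::getSpillSize(*RC) while keeping the endian-dependent offset math unchanged. A minimal standalone sketch of that math follows, using made-up values and names rather than the LLVM types:

#include <cassert>
#include <cstdio>

// Sketch only: mirrors the offset arithmetic preserved by the hunks above.
// SpillSize, Offset and Size are in bytes; the names are illustrative.
unsigned subRegSpillOffset(unsigned SpillSize, unsigned Offset, unsigned Size,
                           bool IsLittleEndian) {
  assert(SpillSize >= Offset + Size && "bad subregister range");
  // On big-endian layouts the byte offset is counted from the other end of
  // the spill slot.
  return IsLittleEndian ? Offset : SpillSize - (Offset + Size);
}

int main() {
  // A 4-byte subregister at byte offset 0 of a 16-byte spill slot lands at
  // byte offset 12 once the offset is mirrored for a big-endian target.
  std::printf("%u\n", subRegSpillOffset(16, 0, 4, /*IsLittleEndian=*/false));
  return 0;
}

The mirroring exists because a big-endian target stores the most significant bytes of the full register at the start of the slot, so a subregister's bytes sit at the opposite end from where a little-endian target would put them.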
@@ -364,10 +364,10 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, Size = BitSize /= 8; Offset = (unsigned)BitOffset / 8; - assert(RC->getSize() >= (Offset + Size) && "bad subregister range"); + assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = RC->getSize() - (Offset + Size); + Offset = TRI->getSpillSize(*RC) - (Offset + Size); } return true; } @@ -428,8 +428,8 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI, return nullptr; } -void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - llvm_unreachable("Not a MachO target"); +void TargetInstrInfo::getNoop(MCInst &NopInst) const { + llvm_unreachable("Not implemented"); } static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, @@ -470,7 +470,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, // No need to fold return, the meta data, and function arguments for (unsigned i = 0; i < StartIdx; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) { MachineOperand &MO = MI.getOperand(i); @@ -490,7 +490,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, MIB.addImm(SpillOffset); } else - MIB.addOperand(MO); + MIB.add(MO); } return NewMI; } @@ -941,12 +941,10 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const { unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); - if (MI.getOpcode() != FrameSetupOpcode && - MI.getOpcode() != FrameDestroyOpcode) + if (!isFrameInstr(MI)) return 0; - int SPAdj = MI.getOperand(0).getImm(); - SPAdj = TFI->alignSPAdjust(SPAdj); + int SPAdj = TFI->alignSPAdjust(getFrameSize(MI)); if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) || (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode)) diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp index 003311b..3914ee5 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Target/TargetLowering.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" @@ -21,6 +20,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -33,6 +33,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -53,6 +54,18 @@ static cl::opt<unsigned> MaximumJumpTableSize ("max-jump-table-size", cl::init(0), cl::Hidden, cl::desc("Set maximum size of jump tables; zero for no limit.")); +/// Minimum jump table density for normal functions. +static cl::opt<unsigned> + JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden, + cl::desc("Minimum density for building a jump table in " + "a normal function")); + +/// Minimum jump table density for -Os or -Oz functions. 
+static cl::opt<unsigned> OptsizeJumpTableDensity( + "optsize-jump-table-density", cl::init(40), cl::Hidden, + cl::desc("Minimum density for building a jump table in " + "an optsize function")); + // Although this default value is arbitrary, it is not random. It is assumed // that a condition that evaluates the same way by a higher percentage than this // is best represented as control flow. Therefore, the default value N should be @@ -361,11 +374,36 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::MEMCPY] = "memcpy"; Names[RTLIB::MEMMOVE] = "memmove"; Names[RTLIB::MEMSET] = "memset"; - Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_1] = "__llvm_memcpy_element_atomic_1"; - Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_2] = "__llvm_memcpy_element_atomic_2"; - Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_4] = "__llvm_memcpy_element_atomic_4"; - Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_8] = "__llvm_memcpy_element_atomic_8"; - Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_16] = "__llvm_memcpy_element_atomic_16"; + Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_1] = + "__llvm_memcpy_element_unordered_atomic_1"; + Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_2] = + "__llvm_memcpy_element_unordered_atomic_2"; + Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_4] = + "__llvm_memcpy_element_unordered_atomic_4"; + Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_8] = + "__llvm_memcpy_element_unordered_atomic_8"; + Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_16] = + "__llvm_memcpy_element_unordered_atomic_16"; + Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1] = + "__llvm_memmove_element_unordered_atomic_1"; + Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2] = + "__llvm_memmove_element_unordered_atomic_2"; + Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4] = + "__llvm_memmove_element_unordered_atomic_4"; + Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8] = + "__llvm_memmove_element_unordered_atomic_8"; + Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16] = + "__llvm_memmove_element_unordered_atomic_16"; + Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_1] = + "__llvm_memset_element_unordered_atomic_1"; + Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_2] = + "__llvm_memset_element_unordered_atomic_2"; + Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_4] = + "__llvm_memset_element_unordered_atomic_4"; + Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_8] = + "__llvm_memset_element_unordered_atomic_8"; + Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_16] = + "__llvm_memset_element_unordered_atomic_16"; Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume"; Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1"; Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2"; @@ -768,22 +806,55 @@ RTLIB::Libcall RTLIB::getSYNC(unsigned Opc, MVT VT) { return UNKNOWN_LIBCALL; } -RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_ATOMIC(uint64_t ElementSize) { +RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) { + switch (ElementSize) { + case 1: + return MEMCPY_ELEMENT_UNORDERED_ATOMIC_1; + case 2: + return MEMCPY_ELEMENT_UNORDERED_ATOMIC_2; + case 4: + return MEMCPY_ELEMENT_UNORDERED_ATOMIC_4; + case 8: + return MEMCPY_ELEMENT_UNORDERED_ATOMIC_8; + case 16: + return MEMCPY_ELEMENT_UNORDERED_ATOMIC_16; + default: + return UNKNOWN_LIBCALL; + } +} + +RTLIB::Libcall RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) { switch (ElementSize) { case 1: - return MEMCPY_ELEMENT_ATOMIC_1; + return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1; case 2: - return MEMCPY_ELEMENT_ATOMIC_2; + return 
MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2; case 4: - return MEMCPY_ELEMENT_ATOMIC_4; + return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4; case 8: - return MEMCPY_ELEMENT_ATOMIC_8; + return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8; case 16: - return MEMCPY_ELEMENT_ATOMIC_16; + return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16; default: return UNKNOWN_LIBCALL; } +} +RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) { + switch (ElementSize) { + case 1: + return MEMSET_ELEMENT_UNORDERED_ATOMIC_1; + case 2: + return MEMSET_ELEMENT_UNORDERED_ATOMIC_2; + case 4: + return MEMSET_ELEMENT_UNORDERED_ATOMIC_4; + case 8: + return MEMSET_ELEMENT_UNORDERED_ATOMIC_8; + case 16: + return MEMSET_ELEMENT_UNORDERED_ATOMIC_16; + default: + return UNKNOWN_LIBCALL; + } } /// InitCmpLibcallCCs - Set default comparison libcall CC. @@ -829,16 +900,16 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { initActions(); // Perform these initializations only once. - MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; - MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize - = MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = + MaxLoadsPerMemcmp = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = + MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; - MaskAndBranchFoldingIsLegal = false; EnableExtLdPromotion = false; HasFloatingPointExceptions = true; StackPointerRegisterToSaveRestore = 0; @@ -851,7 +922,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { MinFunctionAlignment = 0; PrefFunctionAlignment = 0; PrefLoopAlignment = 0; - GatherAllAliasesMaxDepth = 6; + GatherAllAliasesMaxDepth = 18; MinStackArgumentAlignment = 1; // TODO: the default will be switched to 0 in the next commit, along // with the Target-specific changes necessary. @@ -901,6 +972,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMAX, VT, Expand); setOperationAction(ISD::UMIN, VT, Expand); setOperationAction(ISD::UMAX, VT, Expand); + setOperationAction(ISD::ABS, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); @@ -910,6 +982,11 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + // ADDCARRY operations default to expand + setOperationAction(ISD::ADDCARRY, VT, Expand); + setOperationAction(ISD::SUBCARRY, VT, Expand); + setOperationAction(ISD::SETCCCARRY, VT, Expand); + // These default to Expand so they will be expanded to CTLZ/CTTZ by default. setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); @@ -918,6 +995,7 @@ void TargetLoweringBase::initActions() { // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); // These operations default to expand for vector types. if (VT.isVector()) { @@ -1184,12 +1262,11 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, /// isLegalRC - Return true if the value types that can be represented by the /// specified register class are all legal. 
-bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const { - for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); - I != E; ++I) { +bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) const { + for (auto I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I) if (isTypeLegal(*I)) return true; - } return false; } @@ -1227,7 +1304,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, // Copy operands before the frame-index. for (unsigned i = 0; i < OperIdx; ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); // Add frame index operands recognized by stackmaps.cpp if (MFI.isStatepointSpillSlotObjectIndex(FI)) { // indirect-mem-ref tag, size, #FI, offset. @@ -1237,18 +1314,18 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity"); MIB.addImm(StackMaps::IndirectMemRefOp); MIB.addImm(MFI.getObjectSize(FI)); - MIB.addOperand(MI->getOperand(OperIdx)); + MIB.add(MI->getOperand(OperIdx)); MIB.addImm(0); } else { // direct-mem-ref tag, #FI, offset. // Used by patchpoint, and direct alloca arguments to statepoints MIB.addImm(StackMaps::DirectMemRefOp); - MIB.addOperand(MI->getOperand(OperIdx)); + MIB.add(MI->getOperand(OperIdx)); MIB.addImm(0); } // Copy the operands after the frame index. for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); // Inherit previous memory operands. MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); @@ -1296,12 +1373,12 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI, // Find the first legal register class with the largest spill size. const TargetRegisterClass *BestRC = RC; - for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { + for (unsigned i : SuperRegRC.set_bits()) { const TargetRegisterClass *SuperRC = TRI->getRegClass(i); // We want the largest possible spill size. - if (SuperRC->getSize() <= BestRC->getSize()) + if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC)) continue; - if (!isLegalRC(SuperRC)) + if (!isLegalRC(*TRI, *SuperRC)) continue; BestRC = SuperRC; } @@ -1437,6 +1514,7 @@ void TargetLoweringBase::computeRegisterProperties( } if (IsLegalWiderType) break; + LLVM_FALLTHROUGH; } case TypeWidenVector: { // Try to widen the vector. @@ -1454,6 +1532,7 @@ void TargetLoweringBase::computeRegisterProperties( } if (IsLegalWiderType) break; + LLVM_FALLTHROUGH; } case TypeSplitVector: case TypeScalarizeVector: { @@ -1589,7 +1668,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT /// type of the given function. This does not require a DAG or a return value, /// and is suitable for use before any DAGs for the function are constructed. /// TODO: Move this out of TargetLowering.cpp. 
-void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, +void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr, SmallVectorImpl<ISD::OutputArg> &Outs, const TargetLowering &TLI, const DataLayout &DL) { SmallVector<EVT, 4> ValueVTs; @@ -1601,9 +1680,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, EVT VT = ValueVTs[j]; ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; // FIXME: C calling convention requires the return type to be promoted to @@ -1616,18 +1695,20 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, VT = MinVT; } - unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); - MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); + unsigned NumParts = + TLI.getNumRegistersForCallingConv(ReturnType->getContext(), VT); + MVT PartVT = + TLI.getRegisterTypeForCallingConv(ReturnType->getContext(), VT); // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg)) Flags.setInReg(); // Propagate extension type if any - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) Flags.setSExt(); - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) @@ -1818,7 +1899,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Type *StackPtrTy = Type::getInt8PtrTy(M->getContext()); Value *Fn = M->getOrInsertFunction("__safestack_pointer_address", - StackPtrTy->getPointerTo(0), nullptr); + StackPtrTy->getPointerTo(0)); return IRB.CreateCall(Fn); } @@ -1902,6 +1983,10 @@ void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) { MinimumJumpTableEntries = Val; } +unsigned TargetLoweringBase::getMinimumJumpTableDensity(bool OptForSize) const { + return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity; +} + unsigned TargetLoweringBase::getMaximumJumpTableSize() const { return MaximumJumpTableSize; } @@ -1918,11 +2003,7 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) { /// override the target defaults. static StringRef getRecipEstimateForFunc(MachineFunction &MF) { const Function *F = MF.getFunction(); - StringRef RecipAttrName = "reciprocal-estimates"; - if (!F->hasFnAttribute(RecipAttrName)) - return StringRef(); - - return F->getFnAttribute(RecipAttrName).getValueAsString(); + return F->getFnAttribute("reciprocal-estimates").getValueAsString(); } /// Construct a string for the given reciprocal operation of the given type. 
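The RTLIB hunks a little above rename the old MEMCPY_ELEMENT_ATOMIC_* entries and add memmove and memset counterparts, each selected purely by element size. A standalone sketch of the naming scheme the three getMEM*_ELEMENT_UNORDERED_ATOMIC switches encode; the helper below is illustrative, not an LLVM API:

#include <cstdint>
#include <iostream>
#include <string>

// Only power-of-two element sizes from 1 to 16 have a runtime entry point;
// anything else maps to the equivalent of RTLIB::UNKNOWN_LIBCALL.
std::string elementAtomicLibcallName(const std::string &Op,
                                     uint64_t ElementSize) {
  switch (ElementSize) {
  case 1: case 2: case 4: case 8: case 16:
    return "__llvm_" + Op + "_element_unordered_atomic_" +
           std::to_string(ElementSize);
  default:
    return ""; // stands in for UNKNOWN_LIBCALL
  }
}

int main() {
  // Prints __llvm_memmove_element_unordered_atomic_4
  std::cout << elementAtomicLibcallName("memmove", 4) << '\n';
  return 0;
}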
@@ -2097,3 +2178,7 @@ int TargetLoweringBase::getDivRefinementSteps(EVT VT, MachineFunction &MF) const { return getOpRefinementSteps(false, VT, getRecipEstimateForFunc(MF)); } + +void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const { + MF.getRegInfo().freezeReservedRegs(MF); +} diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index eb2a28f..6922e33 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===// +//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===// // // The LLVM Compiler Infrastructure // @@ -14,49 +14,109 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/Comdat.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" #include "llvm/ProfileData/InstrProf.h" -#include "llvm/Support/COFF.h" -#include "llvm/Support/Dwarf.h" -#include "llvm/Support/ELF.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <string> + using namespace llvm; using namespace dwarf; +static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags, + StringRef &Section) { + SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags; + M.getModuleFlagsMetadata(ModuleFlags); + + for (const auto &MFE: ModuleFlags) { + // Ignore flags with 'Require' behaviour. 
+ if (MFE.Behavior == Module::Require) + continue; + + StringRef Key = MFE.Key->getString(); + if (Key == "Objective-C Image Info Version") { + Version = mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue(); + } else if (Key == "Objective-C Garbage Collection" || + Key == "Objective-C GC Only" || + Key == "Objective-C Is Simulated" || + Key == "Objective-C Class Properties" || + Key == "Objective-C Image Swift Version") { + Flags |= mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue(); + } else if (Key == "Objective-C Image Info Section") { + Section = cast<MDString>(MFE.Val)->getString(); + } + } +} + //===----------------------------------------------------------------------===// // ELF //===----------------------------------------------------------------------===// +void TargetLoweringObjectFileELF::emitModuleMetadata( + MCStreamer &Streamer, Module &M, const TargetMachine &TM) const { + unsigned Version = 0; + unsigned Flags = 0; + StringRef Section; + + GetObjCImageInfo(M, Version, Flags, Section); + if (Section.empty()) + return; + + auto &C = getContext(); + auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + Streamer.SwitchSection(S); + Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO"))); + Streamer.EmitIntValue(Version, 4); + Streamer.EmitIntValue(Flags, 4); + Streamer.AddBlankLine(); +} + MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol( const GlobalValue *GV, const TargetMachine &TM, MachineModuleInfo *MMI) const { unsigned Encoding = getPersonalityEncoding(); - if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect) + if ((Encoding & 0x80) == DW_EH_PE_indirect) return getContext().getOrCreateSymbol(StringRef("DW.ref.") + TM.getSymbol(GV)->getName()); - if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr) + if ((Encoding & 0x70) == DW_EH_PE_absptr) return TM.getSymbol(GV); report_fatal_error("We do not support this DWARF encoding yet!"); } @@ -86,8 +146,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue( const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { - - if (Encoding & dwarf::DW_EH_PE_indirect) { + if (Encoding & DW_EH_PE_indirect) { MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>(); MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM); @@ -102,7 +161,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference( return TargetLoweringObjectFile:: getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + Encoding & ~DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -117,8 +176,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { // section(".eh_frame") gcc will produce: // // .section .eh_frame,"a",@progbits - - if (Name == getInstrProfCoverageSectionName(false)) + + if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF, + /*AddSegmentInfo=*/false)) return SectionKind::getMetadata(); if (Name.empty() || Name[0] != '.') return K; @@ -149,7 +209,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { return K; } - static unsigned getELFSectionType(StringRef Name, SectionKind K) { // Use SHT_NOTE for section whose name starts with ".note" to allow // emitting ELF notes from C variable declaration. 
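The new TargetLoweringObjectFileELF::emitModuleMetadata hook above writes the Objective-C image info gathered by GetObjCImageInfo as two 4-byte integers under an OBJC_IMAGE_INFO label. A minimal sketch of that record's shape, assuming nothing beyond what the two EmitIntValue calls emit; the struct and values are illustrative:

#include <cstdint>
#include <cstdio>

// Eight bytes total: version first, then the flag bits collected from the
// "Objective-C ..." module flags.
struct ObjCImageInfo {
  uint32_t Version;
  uint32_t Flags;
};

int main() {
  ObjCImageInfo Info{0, 0};
  std::printf("image-info record: %zu bytes (version %u, flags %u)\n",
              sizeof(Info), Info.Version, Info.Flags);
  return 0;
}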
@@ -211,10 +270,47 @@ static const Comdat *getELFComdat(const GlobalValue *GV) { return C; } +static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, + const TargetMachine &TM) { + MDNode *MD = GO->getMetadata(LLVMContext::MD_associated); + if (!MD) + return nullptr; + + const MDOperand &Op = MD->getOperand(0); + if (!Op.get()) + return nullptr; + + auto *VM = dyn_cast<ValueAsMetadata>(Op); + if (!VM) + report_fatal_error("MD_associated operand is not ValueAsMetadata"); + + GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue()); + return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr; +} + MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { StringRef SectionName = GO->getSection(); + // Check if '#pragma clang section' name is applicable. + // Note that pragma directive overrides -ffunction-section, -fdata-section + // and so section name is exactly as user specified and not uniqued. + const GlobalVariable *GV = dyn_cast<GlobalVariable>(GO); + if (GV && GV->hasImplicitSection()) { + auto Attrs = GV->getAttributes(); + if (Attrs.hasAttribute("bss-section") && Kind.isBSS()) { + SectionName = Attrs.getAttribute("bss-section").getValueAsString(); + } else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) { + SectionName = Attrs.getAttribute("rodata-section").getValueAsString(); + } else if (Attrs.hasAttribute("data-section") && Kind.isData()) { + SectionName = Attrs.getAttribute("data-section").getValueAsString(); + } + } + const Function *F = dyn_cast<Function>(GO); + if (F && F->hasFnAttribute("implicit-section-name")) { + SectionName = F->getFnAttribute("implicit-section-name").getValueAsString(); + } + // Infer section flags from the section name if we can. Kind = getELFKindForNamedSection(SectionName, Kind); @@ -224,9 +320,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( Group = C->getName(); Flags |= ELF::SHF_GROUP; } - return getContext().getELFSection(SectionName, - getELFSectionType(SectionName, Kind), Flags, - /*EntrySize=*/0, Group); + + // A section can have at most one associated section. Put each global with + // MD_associated in a unique section. + unsigned UniqueID = MCContext::GenericSectionID; + const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); + if (AssociatedSymbol) { + UniqueID = NextUniqueID++; + Flags |= ELF::SHF_LINK_ORDER; + } + + MCSectionELF *Section = getContext().getELFSection( + SectionName, getELFSectionType(SectionName, Kind), Flags, + /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); + // Make sure that we did not get some other section with incompatible sh_link. + // This should not be possible due to UniqueID code above. 
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol); + return Section; } /// Return the section prefix name used by options FunctionsSections and @@ -248,11 +358,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro"; } -static MCSectionELF * -selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, - SectionKind Kind, Mangler &Mang, - const TargetMachine &TM, bool EmitUniqueSection, - unsigned Flags, unsigned *NextUniqueID) { +static MCSectionELF *selectELFSectionForGlobal( + MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, + const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, + unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) { unsigned EntrySize = 0; if (Kind.isMergeableCString()) { if (Kind.isMergeable2ByteCString()) { @@ -319,7 +428,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, if (Kind.isExecuteOnly()) UniqueID = 0; return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags, - EntrySize, Group, UniqueID); + EntrySize, Group, UniqueID, AssociatedSymbol); } MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( @@ -337,8 +446,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( } EmitUniqueSection |= GO->hasComdat(); - return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, - EmitUniqueSection, Flags, &NextUniqueID); + const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); + if (AssociatedSymbol) { + EmitUniqueSection = true; + Flags |= ELF::SHF_LINK_ORDER; + } + + MCSectionELF *Section = selectELFSectionForGlobal( + getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags, + &NextUniqueID, AssociatedSymbol); + assert(Section->getAssociatedSymbol() == AssociatedSymbol); + return Section; } MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( @@ -351,8 +469,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( return ReadOnlySection; return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(), - getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC, - &NextUniqueID); + getMangler(), TM, EmitUniqueSection, + ELF::SHF_ALLOC, &NextUniqueID, + /* AssociatedSymbol */ nullptr); } bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection( @@ -500,40 +619,10 @@ void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx, } } -/// emitModuleFlags - Perform code emission for module flags. -void TargetLoweringObjectFileMachO::emitModuleFlags( - MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags, - const TargetMachine &TM) const { - unsigned VersionVal = 0; - unsigned ImageInfoFlags = 0; - MDNode *LinkerOptions = nullptr; - StringRef SectionVal; - - for (const auto &MFE : ModuleFlags) { - // Ignore flags with 'Require' behavior. 
- if (MFE.Behavior == Module::Require) - continue; - - StringRef Key = MFE.Key->getString(); - Metadata *Val = MFE.Val; - - if (Key == "Objective-C Image Info Version") { - VersionVal = mdconst::extract<ConstantInt>(Val)->getZExtValue(); - } else if (Key == "Objective-C Garbage Collection" || - Key == "Objective-C GC Only" || - Key == "Objective-C Is Simulated" || - Key == "Objective-C Class Properties" || - Key == "Objective-C Image Swift Version") { - ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue(); - } else if (Key == "Objective-C Image Info Section") { - SectionVal = cast<MDString>(Val)->getString(); - } else if (Key == "Linker Options") { - LinkerOptions = cast<MDNode>(Val); - } - } - +void TargetLoweringObjectFileMachO::emitModuleMetadata( + MCStreamer &Streamer, Module &M, const TargetMachine &TM) const { // Emit the linker options if present. - if (LinkerOptions) { + if (auto *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) { for (const auto &Option : LinkerOptions->operands()) { SmallVector<std::string, 4> StrOptions; for (const auto &Piece : cast<MDNode>(Option)->operands()) @@ -542,8 +631,15 @@ void TargetLoweringObjectFileMachO::emitModuleFlags( } } + unsigned VersionVal = 0; + unsigned ImageInfoFlags = 0; + StringRef SectionVal; + + GetObjCImageInfo(M, VersionVal, ImageInfoFlags, SectionVal); + // The section is mandatory. If we don't have it, then we don't have GC info. - if (SectionVal.empty()) return; + if (SectionVal.empty()) + return; StringRef Segment, Section; unsigned TAA = 0, StubSize = 0; @@ -723,7 +819,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference( return TargetLoweringObjectFile:: getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + Encoding & ~DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -1055,18 +1151,9 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable( COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID); } -void TargetLoweringObjectFileCOFF::emitModuleFlags( - MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags, - const TargetMachine &TM) const { - MDNode *LinkerOptions = nullptr; - - for (const auto &MFE : ModuleFlags) { - StringRef Key = MFE.Key->getString(); - if (Key == "Linker Options") - LinkerOptions = cast<MDNode>(MFE.Val); - } - - if (LinkerOptions) { +void TargetLoweringObjectFileCOFF::emitModuleMetadata( + MCStreamer &Streamer, Module &M, const TargetMachine &TM) const { + if (NamedMDNode *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) { // Emit the linker options to the linker .drectve section. According to the // spec, this section is a space-separated string containing flags for // linker. 
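As the comment above notes, the COFF path flattens the llvm.linker.options metadata into the .drectve section as one space-separated string of flags. A rough sketch of that format with made-up option values; whether each flag is preceded or followed by the separating space is an assumption here, the point is only that the section holds a single flat string:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Placeholder options; the real ones come from !llvm.linker.options.
  std::vector<std::string> Options = {"/DEFAULTLIB:msvcrt.lib", "/EXPORT:foo"};
  std::string Directive;
  for (const std::string &Opt : Options) {
    Directive += ' ';
    Directive += Opt;
  }
  std::cout << Directive << '\n'; // " /DEFAULTLIB:msvcrt.lib /EXPORT:foo"
  return 0;
}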
@@ -1081,6 +1168,24 @@ void TargetLoweringObjectFileCOFF::emitModuleFlags( } } } + + unsigned Version = 0; + unsigned Flags = 0; + StringRef Section; + + GetObjCImageInfo(M, Version, Flags, Section); + if (Section.empty()) + return; + + auto &C = getContext(); + auto *S = C.getCOFFSection( + Section, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, + SectionKind::getReadOnly()); + Streamer.SwitchSection(S); + Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO"))); + Streamer.EmitIntValue(Version, 4); + Streamer.EmitIntValue(Flags, 4); + Streamer.AddBlankLine(); } void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx, @@ -1122,33 +1227,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal( raw_ostream &OS, const GlobalValue *GV) const { - if (!GV->hasDLLExportStorageClass() || GV->isDeclaration()) - return; + emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler()); +} - const Triple &TT = getTargetTriple(); +//===----------------------------------------------------------------------===// +// Wasm +//===----------------------------------------------------------------------===// - if (TT.isKnownWindowsMSVCEnvironment()) - OS << " /EXPORT:"; - else - OS << " -export:"; - - if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) { - std::string Flag; - raw_string_ostream FlagOS(Flag); - getMangler().getNameWithPrefix(FlagOS, GV, false); - FlagOS.flush(); - if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix()) - OS << Flag.substr(1); - else - OS << Flag; - } else { - getMangler().getNameWithPrefix(OS, GV, false); +static const Comdat *getWasmComdat(const GlobalValue *GV) { + const Comdat *C = GV->getComdat(); + if (!C) + return nullptr; + + if (C->getSelectionKind() != Comdat::Any) + report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" + + C->getName() + "' cannot be lowered."); + + return C; +} + +MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + llvm_unreachable("getExplicitSectionGlobal not yet implemented"); + return nullptr; +} + +static MCSectionWasm * +selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM, bool EmitUniqueSection, + unsigned Flags, unsigned *NextUniqueID) { + StringRef Group = ""; + if (getWasmComdat(GO)) + llvm_unreachable("comdat not yet supported for wasm"); + + bool UniqueSectionNames = TM.getUniqueSectionNames(); + SmallString<128> Name = getSectionPrefixForGlobal(Kind); + + if (const auto *F = dyn_cast<Function>(GO)) { + const auto &OptionalPrefix = F->getSectionPrefix(); + if (OptionalPrefix) + Name += *OptionalPrefix; } - if (!GV->getValueType()->isFunctionTy()) { - if (TT.isKnownWindowsMSVCEnvironment()) - OS << ",DATA"; - else - OS << ",data"; + if (EmitUniqueSection && UniqueSectionNames) { + Name.push_back('.'); + TM.getNameWithPrefix(Name, GO, Mang, true); + } + unsigned UniqueID = MCContext::GenericSectionID; + if (EmitUniqueSection && !UniqueSectionNames) { + UniqueID = *NextUniqueID; + (*NextUniqueID)++; } + return Ctx.getWasmSection(Name, /*Type=*/0, Flags, + Group, UniqueID); +} + +MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + + if (Kind.isCommon()) + report_fatal_error("mergable sections not supported yet on wasm"); + + // 
If we have -ffunction-section or -fdata-section then we should emit the + // global value to a uniqued section specifically for it. + bool EmitUniqueSection = false; + if (Kind.isText()) + EmitUniqueSection = TM.getFunctionSections(); + else + EmitUniqueSection = TM.getDataSections(); + EmitUniqueSection |= GO->hasComdat(); + + return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, + EmitUniqueSection, /*Flags=*/0, + &NextUniqueID); +} + +bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + // We can always create relative relocations, so use another section + // that can be marked non-executable. + return false; +} + +const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( + const GlobalValue *LHS, const GlobalValue *RHS, + const TargetMachine &TM) const { + // We may only use a PLT-relative relocation to refer to unnamed_addr + // functions. + if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) + return nullptr; + + // Basic sanity checks. + if (LHS->getType()->getPointerAddressSpace() != 0 || + RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || + RHS->isThreadLocal()) + return nullptr; + + return MCBinaryExpr::createSub( + MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None, + getContext()), + MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext()); +} + +void +TargetLoweringObjectFileWasm::InitializeWasm() { + // TODO: Initialize StaticCtorSection and StaticDtorSection. } diff --git a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp index b6da8e0..ed845e1 100644 --- a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { return false; } -/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option -/// is specified on the command line. When this flag is off(default), the -/// code generator is not allowed to generate mad (multiply add) if the -/// result is "less precise" than doing those operations individually. -bool TargetOptions::LessPreciseFPMAD() const { - return UnsafeFPMath || LessPreciseFPMADOption; -} - /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. 
bool TargetOptions::HonorSignDependentRoundingFPMath() const { diff --git a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp index e7ea2b4..817e58c 100644 --- a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1,4 +1,4 @@ -//===-- TargetPassConfig.cpp - Target independent code generation passes --===// +//===- TargetPassConfig.cpp - Target independent code generation passes ---===// // // The LLVM Compiler Infrastructure // @@ -13,28 +13,37 @@ //===---------------------------------------------------------------------===// #include "llvm/CodeGen/TargetPassConfig.h" - +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" #include "llvm/Analysis/CFLSteensAliasAnalysis.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Threading.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" +#include <cassert> +#include <string> using namespace llvm; @@ -92,6 +101,19 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden, cl::desc("Verify generated machine code"), cl::init(false), cl::ZeroOrMore); +static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner", + cl::Hidden, + cl::desc("Enable machine outliner")); +// Enable or disable FastISel. Both options are needed, because +// FastISel is enabled by default with -fast, and we wish to be +// able to enable or disable fast-isel independently from -O0. +static cl::opt<cl::boolOrDefault> +EnableFastISelOption("fast-isel", cl::Hidden, + cl::desc("Enable the \"fast\" instruction selector")); + +static cl::opt<cl::boolOrDefault> + EnableGlobalISel("global-isel", cl::Hidden, + cl::desc("Enable the \"global\" instruction selector")); static cl::opt<std::string> PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, @@ -211,6 +233,7 @@ char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; namespace { + struct InsertedPass { AnalysisID TargetPassID; IdentifyingPassPtr InsertedPassID; @@ -231,9 +254,11 @@ struct InsertedPass { return NP; } }; -} + +} // end anonymous namespace namespace llvm { + class PassConfigImpl { public: // List of passes explicitly substituted by this target. Normally this is @@ -249,7 +274,8 @@ public: /// is inserted after each instance of the first one. SmallVector<InsertedPass, 4> InsertedPasses; }; -} // namespace llvm + +} // end namespace llvm // Out of line virtual method. 
TargetPassConfig::~TargetPassConfig() { @@ -258,11 +284,8 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. -TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) - : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), - AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true) { - +TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) + : ImmutablePass(ID), PM(&pm), TM(&TM) { Impl = new PassConfigImpl(); // Register all target independent codegen passes to activate their PassIDs, @@ -278,7 +301,10 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) substitutePass(&PostRAMachineLICMID, &MachineLICMID); if (StringRef(PrintMachineInstrs.getValue()).equals("")) - TM->Options.PrintMachineCode = true; + TM.Options.PrintMachineCode = true; + + if (TM.Options.EnableIPRA) + setRequiresCodeGenSCCOrder(); } CodeGenOpt::Level TargetPassConfig::getOptLevel() const { @@ -303,12 +329,14 @@ void TargetPassConfig::insertPass(AnalysisID TargetPassID, /// /// Targets may override this to extend TargetPassConfig. TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { - return new TargetPassConfig(this, PM); + return new TargetPassConfig(*this, PM); } TargetPassConfig::TargetPassConfig() - : ImmutablePass(ID), PM(nullptr) { - llvm_unreachable("TargetPassConfig should not be constructed on-the-fly"); + : ImmutablePass(ID) { + report_fatal_error("Trying to construct TargetPassConfig without a target " + "machine. Scheduling a CodeGen pass without a target " + "triple set?"); } // Helper to verify the analysis is really immutable. @@ -421,7 +449,12 @@ void TargetPassConfig::addPrintPass(const std::string &Banner) { } void TargetPassConfig::addVerifyPass(const std::string &Banner) { - if (VerifyMachineCode) + bool Verify = VerifyMachineCode; +#ifdef EXPENSIVE_CHECKS + if (VerifyMachineCode == cl::BOU_UNSET) + Verify = TM->isMachineVerifierClean(); +#endif + if (Verify) PM->add(createMachineVerifierPass(Banner)); } @@ -480,6 +513,14 @@ void TargetPassConfig::addIRPasses() { // Insert calls to mcount-like functions. addPass(createCountingFunctionInserterPass()); + + // Add scalarization of target's unsupported masked memory intrinsics pass. + // the unsupported intrinsic will be replaced with a chain of basic blocks, + // that stores/loads element one-by-one if the appropriate mask bit is set. + addPass(createScalarizeMaskedMemIntrinPass()); + + // Expand reduction intrinsics into shuffle sequences if the target wants to. + addPass(createExpandReductionsPass()); } /// Turn exception handling constructs into something the code generators can @@ -499,14 +540,14 @@ void TargetPassConfig::addPassesToHandleExceptions() { LLVM_FALLTHROUGH; case ExceptionHandling::DwarfCFI: case ExceptionHandling::ARM: - addPass(createDwarfEHPass(TM)); + addPass(createDwarfEHPass()); break; case ExceptionHandling::WinEH: // We support using both GCC-style and MSVC-style exceptions on Windows, so // add both preparation passes. Each pass will only actually run if it // recognizes the personality function. 
- addPass(createWinEHPass(TM)); - addPass(createDwarfEHPass(TM)); + addPass(createWinEHPass()); + addPass(createDwarfEHPass()); break; case ExceptionHandling::None: addPass(createLowerInvokePass()); @@ -521,7 +562,7 @@ void TargetPassConfig::addPassesToHandleExceptions() { /// before exception handling preparation passes. void TargetPassConfig::addCodeGenPrepare() { if (getOptLevel() != CodeGenOpt::None && !DisableCGP) - addPass(createCodeGenPreparePass(TM)); + addPass(createCodeGenPreparePass()); addPass(createRewriteSymbolsPass()); } @@ -531,13 +572,13 @@ void TargetPassConfig::addISelPrepare() { addPreISel(); // Force codegen to run according to the callgraph. - if (TM->Options.EnableIPRA) + if (requiresCodeGenSCCOrder()) addPass(new DummyCGSCCPass); // Add both the safe stack and the stack protection passes: each of them will // only protect functions that have corresponding attributes. - addPass(createSafeStackPass(TM)); - addPass(createStackProtectorPass(TM)); + addPass(createSafeStackPass()); + addPass(createStackProtectorPass()); if (PrintISelInput) addPass(createPrintFunctionPass( @@ -549,6 +590,74 @@ void TargetPassConfig::addISelPrepare() { addPass(createVerifierPass()); } +bool TargetPassConfig::addCoreISelPasses() { + // Enable FastISel with -fast, but allow that to be overridden. + TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); + if (EnableFastISelOption == cl::BOU_TRUE || + (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel())) + TM->setFastISel(true); + + // Ask the target for an isel. + // Enable GlobalISel if the target wants to, but allow that to be overriden. + if (EnableGlobalISel == cl::BOU_TRUE || + (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled())) { + if (addIRTranslator()) + return true; + + addPreLegalizeMachineIR(); + + if (addLegalizeMachineIR()) + return true; + + // Before running the register bank selector, ask the target if it + // wants to run some passes. + addPreRegBankSelect(); + + if (addRegBankSelect()) + return true; + + addPreGlobalInstructionSelect(); + + if (addGlobalInstructionSelect()) + return true; + + // Pass to reset the MachineFunction if the ISel failed. + addPass(createResetMachineFunctionPass( + reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled())); + + // Provide a fallback path when we do not want to abort on + // not-yet-supported input. + if (!isGlobalISelAbortEnabled() && addInstSelector()) + return true; + + } else if (addInstSelector()) + return true; + + return false; +} + +bool TargetPassConfig::addISelPasses() { + if (TM->Options.EmulatedTLS) + addPass(createLowerEmuTLSPass()); + + addPass(createPreISelIntrinsicLoweringPass()); + addPass(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); + addIRPasses(); + addCodeGenPrepare(); + addPassesToHandleExceptions(); + addISelPrepare(); + + return addCoreISelPasses(); +} + +/// -regalloc=... command line option. +static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } +static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, + RegisterPassParser<RegisterRegAlloc> > +RegAlloc("regalloc", + cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use")); + /// Add the complete set of target-independent postISel code generator passes. /// /// This can be read as the standard order of major LLVM CodeGen stages. Stages @@ -607,8 +716,12 @@ void TargetPassConfig::addMachinePasses() { // including phi elimination and scheduling. 
if (getOptimizeRegAlloc()) addOptimizedRegAlloc(createRegAllocPass(true)); - else + else { + if (RegAlloc != &useDefaultRegisterAllocator && + RegAlloc != &createFastRegisterAllocator) + report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc."); addFastRegAlloc(createRegAllocPass(false)); + } // Run post-ra passes. addPostRegAlloc(); @@ -620,7 +733,7 @@ void TargetPassConfig::addMachinePasses() { // Prolog/Epilog inserter needs a TargetMachine to instantiate. But only // do so if it hasn't been disabled, substituted, or overridden. if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID)) - addPass(createPrologEpilogInserterPass(TM)); + addPass(createPrologEpilogInserterPass()); /// Add passes that optimize machine instructions after register allocation. if (getOptLevel() != CodeGenOpt::None) @@ -668,9 +781,15 @@ void TargetPassConfig::addMachinePasses() { addPass(&StackMapLivenessID, false); addPass(&LiveDebugValuesID, false); + // Insert before XRay Instrumentation. + addPass(&FEntryInserterID, false); + addPass(&XRayInstrumentationID, false); addPass(&PatchableFunctionID, false); + if (EnableMachineOutliner) + PM->add(createMachineOutlinerPass()); + AddingMachinePasses = false; } @@ -704,6 +823,10 @@ void TargetPassConfig::addMachineSSAOptimization() { addPass(&MachineLICMID, false); addPass(&MachineCSEID, false); + + // Coalesce basic blocks with the same branch condition + addPass(&BranchCoalescingID); + addPass(&MachineSinkingID); addPass(&PeepholeOptimizerID); @@ -730,20 +853,13 @@ MachinePassRegistry RegisterRegAlloc::Registry; /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. -LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag); -static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } +static llvm::once_flag InitializeDefaultRegisterAllocatorFlag; + static RegisterRegAlloc defaultRegAlloc("default", "pick register allocator based on -O option", useDefaultRegisterAllocator); -/// -regalloc=... command line option. -static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, - RegisterPassParser<RegisterRegAlloc> > -RegAlloc("regalloc", - cl::init(&useDefaultRegisterAllocator), - cl::desc("Register allocator to use")); - static void initializeDefaultRegisterAllocatorOnce() { RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault(); @@ -753,7 +869,6 @@ static void initializeDefaultRegisterAllocatorOnce() { } } - /// Instantiate the default register allocator pass for this target for either /// the optimized or unoptimized allocation path. 
This will be added to the pass /// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc @@ -903,6 +1018,11 @@ void TargetPassConfig::addBlockPlacement() { //===---------------------------------------------------------------------===// /// GlobalISel Configuration //===---------------------------------------------------------------------===// + +bool TargetPassConfig::isGlobalISelEnabled() const { + return false; +} + bool TargetPassConfig::isGlobalISelAbortEnabled() const { return EnableGlobalISelAbort == 1; } diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp index cd50c5b..eeb00a7 100644 --- a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===// +//==- TargetRegisterInfo.cpp - Target Register Information Implementation --==// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,27 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <utility> #define DEBUG_TYPE "target-reg-info" @@ -38,7 +48,7 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, CoveringLanes(SRICoveringLanes) { } -TargetRegisterInfo::~TargetRegisterInfo() {} +TargetRegisterInfo::~TargetRegisterInfo() = default; void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet, unsigned Reg) const { @@ -50,8 +60,7 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, ArrayRef<MCPhysReg> Exceptions) const { // Check that all super registers of reserved regs are reserved as well. BitVector Checked(getNumRegs()); - for (int Reg = RegisterSet.find_first(); Reg>=0; - Reg = RegisterSet.find_next(Reg)) { + for (unsigned Reg : RegisterSet.set_bits()) { if (Checked[Reg]) continue; for (MCSuperRegIterator SR(Reg, this); SR.isValid(); ++SR) { @@ -127,7 +136,7 @@ Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { }); } -} // End of llvm namespace +} // end namespace llvm /// getAllocatableClass - Return the maximal subclass of the given register /// class that is alloctable, or NULL. @@ -155,10 +164,9 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const { // Pick the most sub register class of the right type that contains // this physreg. 
const TargetRegisterClass* BestRC = nullptr; - for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){ - const TargetRegisterClass* RC = *I; - if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) && - (!BestRC || BestRC->hasSubClass(RC))) + for (const TargetRegisterClass* RC : regclasses()) { + if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) && + RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; } @@ -185,10 +193,9 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, if (SubClass) getAllocatableSetForRC(MF, SubClass, Allocatable); } else { - for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), - E = regclass_end(); I != E; ++I) - if ((*I)->isAllocatable()) - getAllocatableSetForRC(MF, *I, Allocatable); + for (const TargetRegisterClass *C : regclasses()) + if (C->isAllocatable()) + getAllocatableSetForRC(MF, C, Allocatable); } // Mask out the reserved registers @@ -209,7 +216,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A, if (unsigned Common = *A++ & *B++) { const TargetRegisterClass *RC = TRI->getRegClass(I + countTrailingZeros(Common)); - if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + if (SVT == MVT::SimpleValueType::Any || TRI->isTypeLegalForClass(*RC, VT)) return RC; } return nullptr; @@ -267,7 +274,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, const TargetRegisterClass *BestRC = nullptr; unsigned *BestPreA = &PreA; unsigned *BestPreB = &PreB; - if (RCA->getSize() < RCB->getSize()) { + if (getRegSizeInBits(*RCA) < getRegSizeInBits(*RCB)) { std::swap(RCA, RCB); std::swap(SubA, SubB); std::swap(BestPreA, BestPreB); @@ -275,7 +282,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, // Also terminate the search one we have found a register class as small as // RCA. - unsigned MinSize = RCA->getSize(); + unsigned MinSize = getRegSizeInBits(*RCA); for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) { unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA); @@ -283,7 +290,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, // Check if a common super-register class exists for this index pair. const TargetRegisterClass *RC = firstCommonClass(IA.getMask(), IB.getMask(), this); - if (!RC || RC->getSize() < MinSize) + if (!RC || getRegSizeInBits(*RC) < MinSize) continue; // The indexes must compose identically: PreA+SubA == PreB+SubB. @@ -292,7 +299,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, continue; // Is RC a better candidate than BestRC? - if (BestRC && RC->getSize() >= BestRC->getSize()) + if (BestRC && getRegSizeInBits(*RC) >= getRegSizeInBits(*BestRC)) continue; // Yes, RC is the smallest super-register seen so far. @@ -301,7 +308,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, *BestPreB = IB.getSubReg(); // Bail early if we reached MinSize. We won't find a better candidate. 
- if (BestRC->getSize() == MinSize) + if (getRegSizeInBits(*BestRC) == MinSize) return BestRC; } } @@ -415,9 +422,9 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void -TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, - const TargetRegisterInfo *TRI) { +LLVM_DUMP_METHOD +void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, + const TargetRegisterInfo *TRI) { dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n"; } #endif diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp index 83e52d3..9210ea8 100644 --- a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp @@ -1,4 +1,4 @@ -//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===// +//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===// // // The LLVM Compiler Infrastructure // @@ -13,11 +13,21 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCSchedule.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; @@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const { static unsigned gcd(unsigned Dividend, unsigned Divisor) { // Dividend and Divisor will be naturally swapped as needed. - while(Divisor) { + while (Divisor) { unsigned Rem = Dividend % Divisor; Dividend = Divisor; Divisor = Rem; }; return Dividend; } + static unsigned lcm(unsigned A, unsigned B) { unsigned LCM = (uint64_t(A) * B) / gcd(A, B); assert((LCM >= A && LCM >= B) && "LCM overflow"); @@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm, } } +/// Returns true only if instruction is specified as single issue. +bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->BeginGroup; + } + return false; +} + +bool TargetSchedModel::mustEndGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->EndGroup; + } + return false; +} + unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC) const { if (hasInstrItineraries()) { @@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) { /// evaluation of predicates that depend on instruction operands or flags. const MCSchedClassDesc *TargetSchedModel:: resolveSchedClass(const MachineInstr *MI) const { - // Get the definition's scheduling class descriptor from this machine model. 
unsigned SchedClass = MI->getDesc().getSchedClass(); const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); @@ -244,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { if (SCDesc->isValid() && !SCDesc->isVariant()) return computeInstrLatency(*SCDesc); - llvm_unreachable("No MI sched latency"); + if (SCDesc->isValid()) { + assert (!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()"); + return computeInstrLatency(*SCDesc); + } + return 0; } unsigned @@ -298,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, } return 0; } + +static Optional<double> +getRThroughputFromItineraries(unsigned schedClass, + const InstrItineraryData *IID){ + double Unknown = std::numeric_limits<double>::infinity(); + double Throughput = Unknown; + + for (const InstrStage *IS = IID->beginStage(schedClass), + *E = IID->endStage(schedClass); + IS != E; ++IS) { + unsigned Cycles = IS->getCycles(); + if (!Cycles) + continue; + Throughput = + std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles); + } + // We need reciprocal throughput that's why we return such value. + return 1 / Throughput; +} + +static Optional<double> +getRThroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc, + const TargetSubtargetInfo *STI, + const MCSchedModel &SchedModel) { + double Unknown = std::numeric_limits<double>::infinity(); + double Throughput = Unknown; + + for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc), + *WEnd = STI->getWriteProcResEnd(SCDesc); + WPR != WEnd; ++WPR) { + unsigned Cycles = WPR->Cycles; + if (!Cycles) + return Optional<double>(); + + unsigned NumUnits = + SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits; + Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles); + } + // We need reciprocal throughput that's why we return such value. 
+ return 1 / Throughput; +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const { + if (hasInstrItineraries()) + return getRThroughputFromItineraries(MI->getDesc().getSchedClass(), + getInstrItineraries()); + if (hasInstrSchedModel()) + return getRThroughputFromInstrSchedModel(resolveSchedClass(MI), STI, + SchedModel); + return Optional<double>(); +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const { + unsigned SchedClass = TII->get(Opcode).getSchedClass(); + if (hasInstrItineraries()) + return getRThroughputFromItineraries(SchedClass, getInstrItineraries()); + if (hasInstrSchedModel()) { + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); + if (SCDesc->isValid() && !SCDesc->isVariant()) + return getRThroughputFromInstrSchedModel(SCDesc, STI, SchedModel); + } + return Optional<double>(); +} diff --git a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp index c74707d..f6d5bc8 100644 --- a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -1,4 +1,4 @@ -//===-- TargetSubtargetInfo.cpp - General Target Information ---------------==// +//===- TargetSubtargetInfo.cpp - General Target Information ----------------==// // // The LLVM Compiler Infrastructure // @@ -11,12 +11,17 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInst.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include <string> + using namespace llvm; -//--------------------------------------------------------------------------- -// TargetSubtargetInfo Class -// TargetSubtargetInfo::TargetSubtargetInfo( const Triple &TT, StringRef CPU, StringRef FS, ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD, @@ -26,7 +31,7 @@ TargetSubtargetInfo::TargetSubtargetInfo( : MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) { } -TargetSubtargetInfo::~TargetSubtargetInfo() {} +TargetSubtargetInfo::~TargetSubtargetInfo() = default; bool TargetSubtargetInfo::enableAtomicExpand() const { return true; @@ -52,3 +57,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const { bool TargetSubtargetInfo::useAA() const { return false; } + +static std::string createSchedInfoStr(unsigned Latency, + Optional<double> RThroughput) { + static const char *SchedPrefix = " sched: ["; + std::string Comment; + raw_string_ostream CS(Comment); + if (Latency > 0 && RThroughput.hasValue()) + CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue()) + << "]"; + else if (Latency > 0) + CS << SchedPrefix << Latency << ":?]"; + else if (RThroughput.hasValue()) + CS << SchedPrefix << "?:" << RThroughput.getValue() << "]"; + CS.flush(); + return Comment; +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { + if (MI.isPseudo() || MI.isTerminator()) + return std::string(); + // We don't cache TSchedModel because it depends on TargetInstrInfo + // that could be changed during the compilation + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + Optional<double> RThroughput = 
TSchedModel.computeInstrRThroughput(&MI); + return createSchedInfoStr(Latency, RThroughput); +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { + // We don't cache TSchedModel because it depends on TargetInstrInfo + // that could be changed during the compilation + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + if (!TSchedModel.hasInstrSchedModel()) + return std::string(); + unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode()); + Optional<double> RThroughput = + TSchedModel.computeInstrRThroughput(MCI.getOpcode()); + return createSchedInfoStr(Latency, RThroughput); +} diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 0f1b2ed..83c00e2 100644 --- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -52,7 +52,7 @@ using namespace llvm; -#define DEBUG_TYPE "twoaddrinstr" +#define DEBUG_TYPE "twoaddressinstruction" STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions"); STATISTIC(NumCommuted , "Number of instructions commuted to coalesce"); @@ -68,6 +68,13 @@ EnableRescheduling("twoaddr-reschedule", cl::desc("Coalesce copies by rescheduling (default=true)"), cl::init(true), cl::Hidden); +// Limit the number of dataflow edges to traverse when evaluating the benefit +// of commuting operands. +static cl::opt<unsigned> MaxDataFlowEdge( + "dataflow-edge-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum number of dataflow edges to traverse when evaluating " + "the benefit of commuting operands")); + namespace { class TwoAddressInstructionPass : public MachineFunctionPass { MachineFunction *MF; @@ -155,7 +162,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AAResultsWrapperPass>(); + AU.addUsedIfAvailable<AAResultsWrapperPass>(); AU.addUsedIfAvailable<LiveVariables>(); AU.addPreserved<LiveVariables>(); AU.addPreserved<SlotIndexes>(); @@ -171,10 +178,10 @@ public: } // end anonymous namespace char TwoAddressInstructionPass::ID = 0; -INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; @@ -637,10 +644,10 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, // To more generally minimize register copies, ideally the logic of two addr // instruction pass should be integrated with register allocation pass where // interference graph is available. - if (isRevCopyChain(regC, regA, 3)) + if (isRevCopyChain(regC, regA, MaxDataFlowEdge)) return true; - if (isRevCopyChain(regB, regA, 3)) + if (isRevCopyChain(regB, regA, MaxDataFlowEdge)) return false; // Since there are no intervening uses for both registers, then commute @@ -905,7 +912,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, ++End; } - // Check if the reschedule will not break depedencies. + // Check if the reschedule will not break dependencies. 
unsigned NumVisited = 0; MachineBasicBlock::iterator KillPos = KillMI; ++KillPos; @@ -1627,7 +1634,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable<LiveVariables>(); LIS = getAnalysisIfAvailable<LiveIntervals>(); - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>()) + AA = &AAPass->getAAResults(); + else + AA = nullptr; OptLevel = TM.getOptLevel(); bool MadeChange = false; @@ -1785,7 +1795,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY)) .addReg(DstReg, RegState::Define, SubIdx) - .addOperand(UseMO); + .add(UseMO); // The first def needs an <undef> flag because there is no live register // before it. diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp index c2db56a..407fd9b 100644 --- a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -195,18 +196,31 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { } if (phi->getNumOperands() == 3) { - unsigned Input = phi->getOperand(1).getReg(); - unsigned Output = phi->getOperand(0).getReg(); - - phi++->eraseFromParent(); + const MachineOperand &Input = phi->getOperand(1); + const MachineOperand &Output = phi->getOperand(0); + unsigned InputReg = Input.getReg(); + unsigned OutputReg = Output.getReg(); + assert(Output.getSubReg() == 0 && "Cannot have output subregister"); ModifiedPHI = true; - if (Input != Output) { + if (InputReg != OutputReg) { MachineRegisterInfo &MRI = F.getRegInfo(); - MRI.constrainRegClass(Input, MRI.getRegClass(Output)); - MRI.replaceRegWith(Output, Input); + unsigned InputSub = Input.getSubReg(); + if (InputSub == 0 && + MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) { + MRI.replaceRegWith(OutputReg, InputReg); + } else { + // The input register to the PHI has a subregister or it can't be + // constrained to the proper register class: + // insert a COPY instead of simply replacing the output + // with the input. 
+ const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); + BuildMI(*BB, BB->getFirstNonPHI(), phi->getDebugLoc(), + TII->get(TargetOpcode::COPY), OutputReg) + .addReg(InputReg, getRegState(Input), InputSub); + } + phi++->eraseFromParent(); } - continue; } diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp index 0d506d6..f8aacdb 100644 --- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp @@ -72,9 +72,21 @@ void VirtRegMap::grow() { Virt2SplitMap.resize(NumRegs); } +void VirtRegMap::assignVirt2Phys(unsigned virtReg, MCPhysReg physReg) { + assert(TargetRegisterInfo::isVirtualRegister(virtReg) && + TargetRegisterInfo::isPhysicalRegister(physReg)); + assert(Virt2PhysMap[virtReg] == NO_PHYS_REG && + "attempt to assign physical register to already mapped " + "virtual register"); + assert(!getRegInfo().isReserved(physReg) && + "Attempt to map virtReg to a reserved physReg"); + Virt2PhysMap[virtReg] = physReg; +} + unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { - int SS = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); + int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Align); ++NumSpillSlots; return SS; } @@ -167,6 +179,8 @@ class VirtRegRewriter : public MachineFunctionPass { bool readsUndefSubreg(const MachineOperand &MO) const; void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const; void handleIdentityCopy(MachineInstr &MI) const; + void expandCopyBundle(MachineInstr &MI) const; + bool subRegLiveThrough(const MachineInstr &MI, unsigned SuperPhysReg) const; public: static char ID; @@ -367,11 +381,67 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const { } if (Indexes) - Indexes->removeMachineInstrFromMaps(MI); - MI.eraseFromParent(); + Indexes->removeSingleMachineInstrFromMaps(MI); + MI.eraseFromBundle(); DEBUG(dbgs() << " deleted.\n"); } +/// The liverange splitting logic sometimes produces bundles of copies when +/// subregisters are involved. Expand these into a sequence of copy instructions +/// after processing the last in the bundle. Does not update LiveIntervals +/// which we shouldn't need for this instruction anymore. +void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const { + if (!MI.isCopy()) + return; + + if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) { + // Only do this when the complete bundle is made out of COPYs. + MachineBasicBlock &MBB = *MI.getParent(); + for (MachineBasicBlock::reverse_instr_iterator I = + std::next(MI.getReverseIterator()), E = MBB.instr_rend(); + I != E && I->isBundledWithSucc(); ++I) { + if (!I->isCopy()) + return; + } + + for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator(); + I->isBundledWithPred(); ) { + MachineInstr &MI = *I; + ++I; + + MI.unbundleFromPred(); + if (Indexes) + Indexes->insertMachineInstrInMaps(MI); + } + } +} + +/// Check whether (part of) \p SuperPhysReg is live through \p MI. +/// \pre \p MI defines a subregister of a virtual register that +/// has been assigned to \p SuperPhysReg. 
+bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, + unsigned SuperPhysReg) const { + SlotIndex MIIndex = LIS->getInstructionIndex(MI); + SlotIndex BeforeMIUses = MIIndex.getBaseIndex(); + SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex(); + for (MCRegUnitIterator Unit(SuperPhysReg, TRI); Unit.isValid(); ++Unit) { + const LiveRange &UnitRange = LIS->getRegUnit(*Unit); + // If the regunit is live both before and after MI, + // we assume it is live through. + // Generally speaking, this is not true, because something like + // "RU = op RU" would match that description. + // However, we know that we are trying to assess whether + // a def of a virtual reg, vreg, is live at the same time of RU. + // If we are in the "RU = op RU" situation, that means that vreg + // is defined at the same time as RU (i.e., "vreg, RU = op RU"). + // Thus, vreg and RU interferes and vreg cannot be assigned to + // SuperPhysReg. Therefore, this situation cannot happen. + if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses)) + return true; + } + return false; +} + void VirtRegRewriter::rewrite() { bool NoSubRegLiveness = !MRI->subRegLivenessEnabled(); SmallVector<unsigned, 8> SuperDeads; @@ -409,7 +479,8 @@ void VirtRegRewriter::rewrite() { // A virtual register kill refers to the whole register, so we may // have to add <imp-use,kill> operands for the super-register. A // partial redef always kills and redefines the super-register. - if (MO.readsReg() && (MO.isDef() || MO.isKill())) + if ((MO.readsReg() && (MO.isDef() || MO.isKill())) || + (MO.isDef() && subRegLiveThrough(*MI, PhysReg))) SuperKills.push_back(PhysReg); if (MO.isDef()) { @@ -431,12 +502,14 @@ void VirtRegRewriter::rewrite() { } } - // The <def,undef> flag only makes sense for sub-register defs, and - // we are substituting a full physreg. An <imp-use,kill> operand - // from the SuperKills list will represent the partial read of the - // super-register. - if (MO.isDef()) + // The <def,undef> and <def,internal> flags only make sense for + // sub-register defs, and we are substituting a full physreg. An + // <imp-use,kill> operand from the SuperKills list will represent the + // partial read of the super-register. + if (MO.isDef()) { MO.setIsUndef(false); + MO.setIsInternalRead(false); + } // PhysReg operands cannot have subregister indexes. PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -461,6 +534,8 @@ void VirtRegRewriter::rewrite() { DEBUG(dbgs() << "> " << *MI); + expandCopyBundle(*MI); + // We can remove identity copies right now. handleIdentityCopy(*MI); } diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp index 568720c..c63a0a9 100644 --- a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -16,13 +16,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCSymbol.h" @@ -54,7 +54,7 @@ namespace { class WinEHPrepare : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. 
- WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {} + WinEHPrepare() : FunctionPass(ID) {} bool runOnFunction(Function &Fn) override; @@ -86,6 +86,7 @@ private: // All fields are reset by runOnFunction. EHPersonality Personality = EHPersonality::Unknown; + const DataLayout *DL = nullptr; DenseMap<BasicBlock *, ColorVector> BlockColors; MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks; }; @@ -93,12 +94,10 @@ private: } // end anonymous namespace char WinEHPrepare::ID = 0; -INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions", - false, false) +INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions", + false, false) -FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) { - return new WinEHPrepare(TM); -} +FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); } bool WinEHPrepare::runOnFunction(Function &Fn) { if (!Fn.hasPersonalityFn()) @@ -111,6 +110,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) { if (!isFuncletEHPersonality(Personality)) return false; + DL = &Fn.getParent()->getDataLayout(); return prepareExplicitEH(Fn); } @@ -1070,7 +1070,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) { if (!isa<TerminatorInst>(EHPad)) { // If the EHPad isn't a terminator, then we can insert a load in this block // that will dominate all uses. - SpillSlot = new AllocaInst(PN->getType(), nullptr, + SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr, Twine(PN->getName(), ".wineh.spillslot"), &F.getEntryBlock().front()); Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"), @@ -1157,7 +1157,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot, Function &F) { // Lazilly create the spill slot. if (!SpillSlot) - SpillSlot = new AllocaInst(V->getType(), nullptr, + SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr, Twine(V->getName(), ".wineh.spillslot"), &F.getEntryBlock().front()); diff --git a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp index 63bd762..0b4c6e5 100644 --- a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -1,4 +1,4 @@ -//===-- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. -===// +//===- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. 
--===// // // The LLVM Compiler Infrastructure // @@ -14,18 +14,26 @@ // //===---------------------------------------------------------------------===// -#include "llvm/CodeGen/Analysis.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; namespace { + struct XRayInstrumentation : public MachineFunctionPass { static char ID; @@ -33,6 +41,14 @@ struct XRayInstrumentation : public MachineFunctionPass { initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry()); } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; private: @@ -43,7 +59,7 @@ private: // This is the approach to go on CPUs which have a single RET instruction, // like x86/x86_64. void replaceRetWithPatchableRet(MachineFunction &MF, - const TargetInstrInfo *TII); + const TargetInstrInfo *TII); // Prepend the original return instruction with the exit sled code ("patchable // function exit" pseudo-instruction), preserving the original return @@ -54,13 +70,13 @@ private: // have to call the trampoline and return from it to the original return // instruction of the function being instrumented. void prependRetWithPatchableExit(MachineFunction &MF, - const TargetInstrInfo *TII); + const TargetInstrInfo *TII); }; -} // anonymous namespace -void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF, - const TargetInstrInfo *TII) -{ +} // end anonymous namespace + +void XRayInstrumentation::replaceRetWithPatchableRet( + MachineFunction &MF, const TargetInstrInfo *TII) { // We look for *all* terminators and returns, then replace those with // PATCHABLE_RET instructions. SmallVector<MachineInstr *, 4> Terminators; @@ -81,7 +97,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF, auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc)) .addImm(T.getOpcode()); for (auto &MO : T.operands()) - MIB.addOperand(MO); + MIB.add(MO); Terminators.push_back(&T); } } @@ -91,9 +107,8 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF, I->eraseFromParent(); } -void XRayInstrumentation::prependRetWithPatchableExit(MachineFunction &MF, - const TargetInstrInfo *TII) -{ +void XRayInstrumentation::prependRetWithPatchableExit( + MachineFunction &MF, const TargetInstrInfo *TII) { for (auto &MBB : MF) { for (auto &T : MBB.terminators()) { unsigned Opc = 0; @@ -106,7 +121,7 @@ void XRayInstrumentation::prependRetWithPatchableExit(MachineFunction &MF, if (Opc != 0) { // Prepend the return instruction with PATCHABLE_FUNCTION_EXIT or // PATCHABLE_TAIL_CALL . 
- BuildMI(MBB, T, T.getDebugLoc(),TII->get(Opc)); + BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc)); } } } @@ -125,14 +140,24 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { return false; // XRay threshold attribute not found. if (Attr.getValueAsString().getAsInteger(10, XRayThreshold)) return false; // Invalid value for threshold. - if (F.size() < XRayThreshold) - return false; // Function is too small. + + // Count the number of MachineInstr`s in MachineFunction + int64_t MICount = 0; + for (const auto& MBB : MF) + MICount += MBB.size(); + + // Check if we have a loop. + // FIXME: Maybe make this smarter, and see whether the loops are dependent + // on inputs or side-effects? + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + if (MLI.empty() && MICount < XRayThreshold) + return false; // Function is too small and has no loops. } // We look for the first non-empty MachineBasicBlock, so that we can insert // the function instrumentation in the appropriate place. - auto MBI = - find_if(MF, [&](const MachineBasicBlock &MBB) { return !MBB.empty(); }); + auto MBI = llvm::find_if( + MF, [&](const MachineBasicBlock &MBB) { return !MBB.empty(); }); if (MBI == MF.end()) return false; // The function is empty. @@ -142,12 +167,10 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { if (!MF.getSubtarget().isXRaySupported()) { FirstMI.emitError("An attempt to perform XRay instrumentation for an" - " unsupported target."); + " unsupported target."); return false; } - // FIXME: Do the loop triviality analysis here or in an earlier pass. - // First, insert an PATCHABLE_FUNCTION_ENTER as the first instruction of the // MachineFunction. BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(), @@ -157,6 +180,11 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { case Triple::ArchType::arm: case Triple::ArchType::thumb: case Triple::ArchType::aarch64: + case Triple::ArchType::ppc64le: + case Triple::ArchType::mips: + case Triple::ArchType::mipsel: + case Triple::ArchType::mips64: + case Triple::ArchType::mips64el: // For the architectures which don't have a single return instruction prependRetWithPatchableExit(MF, TII); break; @@ -171,5 +199,8 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { char XRayInstrumentation::ID = 0; char &llvm::XRayInstrumentationID = XRayInstrumentation::ID; -INITIALIZE_PASS(XRayInstrumentation, "xray-instrumentation", "Insert XRay ops", - false, false) +INITIALIZE_PASS_BEGIN(XRayInstrumentation, "xray-instrumentation", + "Insert XRay ops", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(XRayInstrumentation, "xray-instrumentation", + "Insert XRay ops", false, false) |
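A minimal standalone sketch of the skip heuristic introduced in the XRayInstrumentation.cpp hunk above, written in plain C++17 with invented stand-in types (Block, Func, and shouldInstrument are illustration only, not the LLVM MachineFunction/MachineLoopInfo API or its attribute parsing): a function is left uninstrumented only when it is both below the threshold and loop-free.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

struct Block { unsigned NumInstrs; };

struct Func {
  std::vector<Block> Blocks;
  bool HasLoops;       // stand-in for "MachineLoopInfo is non-empty"
  uint64_t Threshold;  // stand-in for the function's XRay threshold attribute
};

static bool shouldInstrument(const Func &F) {
  // Count every instruction in every block, as the patched pass now does.
  uint64_t Count = std::accumulate(
      F.Blocks.begin(), F.Blocks.end(), uint64_t{0},
      [](uint64_t Sum, const Block &B) { return Sum + B.NumInstrs; });
  // Small, loop-free functions are skipped; anything containing a loop is
  // instrumented regardless of its size.
  return F.HasLoops || Count >= F.Threshold;
}

int main() {
  Func SmallFlat{{{3}, {2}}, /*HasLoops=*/false, /*Threshold=*/200};
  Func SmallLoop{{{3}, {2}}, /*HasLoops=*/true, /*Threshold=*/200};
  std::cout << shouldInstrument(SmallFlat) << " "   // 0: small and loop-free
            << shouldInstrument(SmallLoop) << "\n"; // 1: loops force patching
}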
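In the same spirit, a minimal sketch of the reciprocal-throughput computation added in the TargetSchedule.cpp hunks, again with invented stand-ins (ResourceUse, reciprocalThroughput) rather than the MCSchedModel/InstrItineraryData machinery: each consumed processor resource contributes NumUnits / Cycles instructions per cycle, the bottleneck is the minimum over resources, and the value reported is its reciprocal.

#include <algorithm>
#include <iostream>
#include <limits>
#include <optional>
#include <vector>

struct ResourceUse {
  unsigned NumUnits; // copies of the resource available per cycle
  unsigned Cycles;   // cycles the instruction occupies one unit
};

static std::optional<double>
reciprocalThroughput(const std::vector<ResourceUse> &Uses) {
  double Throughput = std::numeric_limits<double>::infinity();
  for (const ResourceUse &U : Uses) {
    if (!U.Cycles)
      return std::nullopt; // unknown, as in the sched-model variant of the patch
    Throughput = std::min(Throughput, U.NumUnits * 1.0 / U.Cycles);
  }
  if (Throughput == std::numeric_limits<double>::infinity())
    return std::nullopt; // nothing constrained the throughput (a simplification)
  return 1.0 / Throughput; // reciprocal throughput, in cycles per instruction
}

int main() {
  // Two ALUs for 1 cycle and one port for 2 cycles: the port limits issue to
  // 0.5 instructions per cycle, so the reciprocal throughput is 2 cycles.
  std::vector<ResourceUse> Uses = {{2, 1}, {1, 2}};
  if (auto RT = reciprocalThroughput(Uses))
    std::cout << "rthroughput: " << *RT << "\n"; // prints 2
}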