Diffstat (limited to 'contrib/llvm/lib')
62 files changed, 826 insertions, 536 deletions
diff --git a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 68f766e..3586354 100644 --- a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -206,14 +206,6 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, return V; } - if (ConstantInt *Const = dyn_cast<ConstantInt>(V)) { - // if it's a constant, just convert it to an offset - // and remove the variable. - Offset += Const->getValue(); - assert(Scale == 0 && "Constant values don't have a scale"); - return V; - } - if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) { switch (BOp->getOpcode()) { @@ -261,10 +253,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, DL, Depth + 1, AC, DT); Scale = Scale.zext(OldWidth); - - // We have to sign-extend even if Extension == EK_ZeroExt as we can't - // decompose a sign extension (i.e. zext(x - 1) != zext(x) - zext(-1)). - Offset = Offset.sext(OldWidth); + Offset = Offset.zext(OldWidth); return Result; } @@ -1135,43 +1124,12 @@ AliasResult BasicAliasAnalysis::aliasGEP( } } + // Try to distinguish something like &A[i][1] against &A[42][0]. + // Grab the least significant bit set in any of the scales. if (!GEP1VariableIndices.empty()) { uint64_t Modulo = 0; - bool AllPositive = true; - for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) { - - // Try to distinguish something like &A[i][1] against &A[42][0]. - // Grab the least significant bit set in any of the scales. We - // don't need std::abs here (even if the scale's negative) as we'll - // be ^'ing Modulo with itself later. + for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) Modulo |= (uint64_t) GEP1VariableIndices[i].Scale; - - if (AllPositive) { - // If the Value could change between cycles, then any reasoning about - // the Value this cycle may not hold in the next cycle. We'll just - // give up if we can't determine conditions that hold for every cycle: - const Value *V = GEP1VariableIndices[i].V; - - bool SignKnownZero, SignKnownOne; - ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, *DL, - 0, AC1, nullptr, DT); - - // Zero-extension widens the variable, and so forces the sign - // bit to zero. - bool IsZExt = GEP1VariableIndices[i].Extension == EK_ZeroExt; - SignKnownZero |= IsZExt; - SignKnownOne &= !IsZExt; - - // If the variable begins with a zero then we know it's - // positive, regardless of whether the value is signed or - // unsigned. - int64_t Scale = GEP1VariableIndices[i].Scale; - AllPositive = - (SignKnownZero && Scale >= 0) || - (SignKnownOne && Scale < 0); - } - } - Modulo = Modulo ^ (Modulo & (Modulo - 1)); // We can compute the difference between the two addresses @@ -1182,12 +1140,6 @@ AliasResult BasicAliasAnalysis::aliasGEP( V2Size != MemoryLocation::UnknownSize && ModOffset >= V2Size && V1Size <= Modulo - ModOffset) return NoAlias; - - // If we know all the variables are positive, then GEP1 >= GEP1BasePtr. - // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers - // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr. 
- if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t) GEP1BaseOffset) - return NoAlias; } // Statically, we can see that the base objects are the same, but the diff --git a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp index 18d45dd..28fb49c 100644 --- a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp @@ -440,30 +440,39 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { } // Scan the function bodies for explicit loads or stores. - for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef; - ++i) - for (inst_iterator II = inst_begin(SCC[i]->getFunction()), - E = inst_end(SCC[i]->getFunction()); - II != E && FunctionEffect != ModRef; ++II) - if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) { + for (auto *Node : SCC) { + if (FunctionEffect == ModRef) + break; // The mod/ref lattice saturates here. + for (Instruction &I : inst_range(Node->getFunction())) { + if (FunctionEffect == ModRef) + break; // The mod/ref lattice saturates here. + + // We handle calls specially because the graph-relevant aspects are + // handled above. + if (auto CS = CallSite(&I)) { + if (isAllocationFn(&I, TLI) || isFreeCall(&I, TLI)) { + // FIXME: It is completely unclear why this is necessary and not + // handled by the above graph code. + FunctionEffect |= ModRef; + } else if (Function *Callee = CS.getCalledFunction()) { + // The callgraph doesn't include intrinsic calls. + if (Callee->isIntrinsic()) { + ModRefBehavior Behaviour = + AliasAnalysis::getModRefBehavior(Callee); + FunctionEffect |= (Behaviour & ModRef); + } + } + continue; + } + + // All non-call instructions we use the primary predicates for whether + // thay read or write memory. + if (I.mayReadFromMemory()) FunctionEffect |= Ref; - if (LI->isVolatile()) - // Volatile loads may have side-effects, so mark them as writing - // memory (for example, a flag inside the processor). - FunctionEffect |= Mod; - } else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) { + if (I.mayWriteToMemory()) FunctionEffect |= Mod; - if (SI->isVolatile()) - // Treat volatile stores as reading memory somewhere. - FunctionEffect |= Ref; - } else if (isAllocationFn(&*II, TLI) || isFreeCall(&*II, TLI)) { - FunctionEffect |= ModRef; - } else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) { - // The callgraph doesn't include intrinsic calls. - Function *Callee = Intrinsic->getCalledFunction(); - ModRefBehavior Behaviour = AliasAnalysis::getModRefBehavior(Callee); - FunctionEffect |= (Behaviour & ModRef); - } + } + } if ((FunctionEffect & Mod) == 0) ++NumReadMemFunctions; diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp index fa42b48..a7f8f5c 100644 --- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3574,18 +3574,9 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &, // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. - if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) { - unsigned IndexVal = IdxC->getZExtValue(); - unsigned VectorWidth = Vec->getType()->getVectorNumElements(); - - // If this is extracting an invalid index, turn this into undef, to avoid - // crashing the code below. 
- if (IndexVal >= VectorWidth) - return UndefValue::get(Vec->getType()->getVectorElementType()); - - if (Value *Elt = findScalarElement(Vec, IndexVal)) + if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) + if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) return Elt; - } return nullptr; } diff --git a/contrib/llvm/lib/Analysis/PHITransAddr.cpp b/contrib/llvm/lib/Analysis/PHITransAddr.cpp index 8d80c60..f7545ea 100644 --- a/contrib/llvm/lib/Analysis/PHITransAddr.cpp +++ b/contrib/llvm/lib/Analysis/PHITransAddr.cpp @@ -374,9 +374,10 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB, if (!Tmp.PHITranslateValue(CurBB, PredBB, &DT, /*MustDominate=*/true)) return Tmp.getAddr(); - // If we don't have an available version of this value, it must be an - // instruction. - Instruction *Inst = cast<Instruction>(InVal); + // We don't need to PHI translate values which aren't instructions. + auto *Inst = dyn_cast<Instruction>(InVal); + if (!Inst) + return nullptr; // Handle cast of PHI translatable value. if (CastInst *Cast = dyn_cast<CastInst>(Inst)) { diff --git a/contrib/llvm/lib/Analysis/VectorUtils.cpp b/contrib/llvm/lib/Analysis/VectorUtils.cpp index 67f68dc..8c671ef 100644 --- a/contrib/llvm/lib/Analysis/VectorUtils.cpp +++ b/contrib/llvm/lib/Analysis/VectorUtils.cpp @@ -402,8 +402,9 @@ llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) { if (match(V, llvm::PatternMatch::m_Add(llvm::PatternMatch::m_Value(Val), llvm::PatternMatch::m_Constant(Con)))) { - if (Con->getAggregateElement(EltNo)->isNullValue()) - return findScalarElement(Val, EltNo); + if (Constant *Elt = Con->getAggregateElement(EltNo)) + if (Elt->isNullValue()) + return findScalarElement(Val, EltNo); } // Otherwise, we don't know. diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp index 201f9c1..5b09cf1 100644 --- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp +++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp @@ -733,14 +733,12 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { // If no relevant registers are used in the function, we can skip it // completely. bool anyregs = false; - const MachineRegisterInfo &MRI = mf.getRegInfo(); for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end(); - I != E && !anyregs; ++I) - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) - if (!MRI.reg_nodbg_empty(*AI)) { - anyregs = true; - break; - } + I != E; ++I) + if (MF->getRegInfo().isPhysRegUsed(*I)) { + anyregs = true; + break; + } if (!anyregs) return false; // Initialize the AliasMap on the first use. 
diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp index 000151a..9ea031d 100644 --- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -15,12 +15,12 @@ #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -49,6 +49,7 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); VRM = &getAnalysis<VirtRegMap>(); @@ -100,6 +101,7 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { << " to " << PrintReg(PhysReg, TRI) << ':'); assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); VRM->assignVirt2Phys(VirtReg.reg, PhysReg); + MRI->setPhysRegUsed(PhysReg); foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 5984af8..e883ce5 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -29,6 +29,7 @@ MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) TracksSubRegLiveness(false) { VRegInfo.reserve(256); RegAllocHints.reserve(256); + UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits()); UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); // Create the physreg use/def lists. diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 9404c68..d9a6b684 100644 --- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -624,6 +624,10 @@ struct DataDep { static bool getDataDeps(const MachineInstr *UseMI, SmallVectorImpl<DataDep> &Deps, const MachineRegisterInfo *MRI) { + // Debug values should not be included in any calculations. + if (UseMI->isDebugValue()) + return false; + bool HasPhysRegs = false; for (MachineInstr::const_mop_iterator I = UseMI->operands_begin(), E = UseMI->operands_end(); I != E; ++I) { diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp index b2fdee6..6ca69a1 100644 --- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1026,8 +1026,12 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Replace this reference to the virtual register with the // scratch register. assert (ScratchReg && "Missing scratch register!"); + MachineRegisterInfo &MRI = Fn.getRegInfo(); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); + // Make sure MRI now accounts this register as used. + MRI.setPhysRegUsed(ScratchReg); + // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. 
diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp index 660bb4f..fd3d4d7 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp @@ -986,6 +986,10 @@ void RAFast::AllocateBasicBlock() { } } + for (UsedInInstrSet::iterator + I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) + MRI->setRegUnitUsed(*I); + // Track registers defined by instruction - early clobbers and tied uses at // this point. UsedInInstr.clear(); @@ -1046,6 +1050,10 @@ void RAFast::AllocateBasicBlock() { killVirtReg(VirtDead[i]); VirtDead.clear(); + for (UsedInInstrSet::iterator + I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) + MRI->setRegUnitUsed(*I); + if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) { DEBUG(dbgs() << "-- coalescing: " << *MI); Coalesced.push_back(MI); @@ -1095,6 +1103,12 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { AllocateBasicBlock(); } + // Add the clobber lists for all the instructions we skipped earlier. + for (const MCInstrDesc *Desc : SkippedInstrs) + if (const uint16_t *Defs = Desc->getImplicitDefs()) + while (*Defs) + MRI->setPhysRegUsed(*Defs++); + // All machine operands and other references to virtual registers have been // replaced. Remove the virtual registers. MRI->clearVirtRegs(); diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7afea2a..c911b9b 100644 --- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1531,6 +1531,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { DEBUG(dbgs() << "\t\tInterference (read): " << *MI); return false; } + + // We must also check for clobbers caused by regmasks. + for (const auto &MO : MI->operands()) { + if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) { + DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI); + return false; + } + } } // We're going to remove the copy which defines a physical reserved diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 52d620b..3b29306 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8365,12 +8365,12 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N0CFP->isExactlyValue(1.0)) return SDValue(); - SmallVector<SDNode *, 4> Users; // Find all FDIV users of the same divisor. - for (auto *U : N1->uses()) { + // Use a set because duplicates may be present in the user list. 
+ SetVector<SDNode *> Users; + for (auto *U : N1->uses()) if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) - Users.push_back(U); - } + Users.insert(U); if (TLI.combineRepeatedFPDivisors(Users.size())) { SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp index 02341b4..2912bdd 100644 --- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp @@ -163,6 +163,7 @@ class VirtRegRewriter : public MachineFunctionPass { SlotIndexes *Indexes; LiveIntervals *LIS; VirtRegMap *VRM; + SparseSet<unsigned> PhysRegs; void rewrite(); void addMBBLiveIns(); @@ -318,15 +319,54 @@ void VirtRegRewriter::rewrite() { SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; + SmallPtrSet<const MachineInstr *, 4> NoReturnInsts; + + // Here we have a SparseSet to hold which PhysRegs are actually encountered + // in the MF we are about to iterate over so that later when we call + // setPhysRegUsed, we are only doing it for physRegs that were actually found + // in the program and not for all of the possible physRegs for the given + // target architecture. If the target has a lot of physRegs, then for a small + // program there will be a significant compile time reduction here. + PhysRegs.clear(); + PhysRegs.setUniverse(TRI->getNumRegs()); + + // The function with uwtable should guarantee that the stack unwinder + // can unwind the stack to the previous frame. Thus, we can't apply the + // noreturn optimization if the caller function has uwtable attribute. + bool HasUWTable = MF->getFunction()->hasFnAttribute(Attribute::UWTable); for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { DEBUG(MBBI->print(dbgs(), Indexes)); + bool IsExitBB = MBBI->succ_empty(); for (MachineBasicBlock::instr_iterator MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) { MachineInstr *MI = MII; ++MII; + // Check if this instruction is a call to a noreturn function. If this + // is a call to noreturn function and we don't need the stack unwinding + // functionality (i.e. this function does not have uwtable attribute and + // the callee function has the nounwind attribute), then we can ignore + // the definitions set by this instruction. + if (!HasUWTable && IsExitBB && MI->isCall()) { + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); MOI != MOE; ++MOI) { + MachineOperand &MO = *MOI; + if (!MO.isGlobal()) + continue; + const Function *Func = dyn_cast<Function>(MO.getGlobal()); + if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) || + // We need to keep correct unwind information + // even if the function will not return, since the + // runtime may need it. + !Func->hasFnAttribute(Attribute::NoUnwind)) + continue; + NoReturnInsts.insert(MI); + break; + } + } + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), MOE = MI->operands_end(); MOI != MOE; ++MOI) { MachineOperand &MO = *MOI; @@ -335,6 +375,15 @@ void VirtRegRewriter::rewrite() { if (MO.isRegMask()) MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); + // If we encounter a VirtReg or PhysReg then get at the PhysReg and add + // it to the physreg bitset. Later we use only the PhysRegs that were + // actually encountered in the MF to populate the MRI's used physregs. + if (MO.isReg() && MO.getReg()) + PhysRegs.insert( + TargetRegisterInfo::isVirtualRegister(MO.getReg()) ? 
+ VRM->getPhys(MO.getReg()) : + MO.getReg()); + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; unsigned VirtReg = MO.getReg(); @@ -421,5 +470,29 @@ void VirtRegRewriter::rewrite() { } } } + + // Tell MRI about physical registers in use. + if (NoReturnInsts.empty()) { + for (SparseSet<unsigned>::iterator + RegI = PhysRegs.begin(), E = PhysRegs.end(); RegI != E; ++RegI) + if (!MRI->reg_nodbg_empty(*RegI)) + MRI->setPhysRegUsed(*RegI); + } else { + for (SparseSet<unsigned>::iterator + I = PhysRegs.begin(), E = PhysRegs.end(); I != E; ++I) { + unsigned Reg = *I; + if (MRI->reg_nodbg_empty(Reg)) + continue; + // Check if this register has a use that will impact the rest of the + // code. Uses in debug and noreturn instructions do not impact the + // generated code. + for (MachineInstr &It : MRI->reg_nodbg_instructions(Reg)) { + if (!NoReturnInsts.count(&It)) { + MRI->setPhysRegUsed(Reg); + break; + } + } + } + } } diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp index c2ff8e2..67a1ca6 100644 --- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp @@ -180,10 +180,17 @@ uint64_t ExecutionEngineState::RemoveMapping(StringRef Name) { } std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { + assert(GV->hasName() && "Global must have name."); + MutexGuard locked(lock); - Mangler Mang; SmallString<128> FullName; - Mang.getNameWithPrefix(FullName, GV, false); + + const DataLayout &DL = + GV->getParent()->getDataLayout().isDefault() + ? *getDataLayout() + : GV->getParent()->getDataLayout(); + + Mangler::getNameWithPrefix(FullName, GV->getName(), DL); return FullName.str(); } diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index a7d6705..f6944ee 100644 --- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -266,6 +266,12 @@ void MCJIT::finalizeModule(Module *M) { RuntimeDyld::SymbolInfo MCJIT::findExistingSymbol(const std::string &Name) { SmallString<128> FullName; Mangler::getNameWithPrefix(FullName, Name, *TM->getDataLayout()); + + if (void *Addr = getPointerToGlobalIfAvailable(FullName)) + return RuntimeDyld::SymbolInfo(static_cast<uint64_t>( + reinterpret_cast<uintptr_t>(Addr)), + JITSymbolFlags::Exported); + return Dyld.getSymbol(FullName); } diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index 044eee4..ecd9900 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -98,7 +98,7 @@ void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { // On OS X OS X __register_frame takes a single FDE as an argument. 
- // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html + // See http://lists.llvm.org/pipermail/llvm-dev/2013-April/061768.html const char *P = (const char *)Addr; const char *End = P + Size; do { diff --git a/contrib/llvm/lib/IR/Type.cpp b/contrib/llvm/lib/IR/Type.cpp index b5c4e5d..a9ca800 100644 --- a/contrib/llvm/lib/IR/Type.cpp +++ b/contrib/llvm/lib/IR/Type.cpp @@ -613,6 +613,9 @@ bool StructType::isLayoutIdentical(StructType *Other) const { if (isPacked() != Other->isPacked() || getNumElements() != Other->getNumElements()) return false; + + if (!getNumElements()) + return true; return std::equal(element_begin(), element_end(), Other->element_begin()); } diff --git a/contrib/llvm/lib/Support/MemoryBuffer.cpp b/contrib/llvm/lib/Support/MemoryBuffer.cpp index 98862e9..d09ef3a 100644 --- a/contrib/llvm/lib/Support/MemoryBuffer.cpp +++ b/contrib/llvm/lib/Support/MemoryBuffer.cpp @@ -57,7 +57,8 @@ void MemoryBuffer::init(const char *BufStart, const char *BufEnd, /// CopyStringRef - Copies contents of a StringRef into a block of memory and /// null-terminates it. static void CopyStringRef(char *Memory, StringRef Data) { - memcpy(Memory, Data.data(), Data.size()); + if (!Data.empty()) + memcpy(Memory, Data.data(), Data.size()); Memory[Data.size()] = 0; // Null terminate string. } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 79a84ad..9d6dbd6 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -593,6 +593,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); + MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a7817f4..a76473f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -354,6 +354,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes && NeedsRealignment) { // Use the first callee-saved register as a scratch register. scratchSPReg = AArch64::X9; + MF.getRegInfo().setPhysRegUsed(scratchSPReg); } // If we're a leaf function, try using the red zone. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td index ef8ef62..68b5050 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -123,6 +123,11 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "true", "VI SGPR initilization bug requiring a fixed SGPR allocation size">; +def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer", + "EnableHugeScratchBuffer", + "true", + "Enable scratch buffer sizes greater than 128 GB">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 37b77d7..64c54cc 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1029,6 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &SLC, SDValue &TFE) const { SDValue Ptr, Offen, Idxen, Addr64; + // addr64 bit was removed for volcanic islands. 
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return false; + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -1095,13 +1099,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - - if (isLegalMUBUFImmOffset(C1)) { - VAddr = Addr.getOperand(0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; + // Offsets in vaddr must be positive. + if (CurDAG->SignBitIsZero(N0)) { + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isLegalMUBUFImmOffset(C1)) { + VAddr = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index bd5abc4..5f32a65 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -73,7 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), - IsaVersion(ISAVersion0_0_0), + IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 90831bf..735f01d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -89,6 +89,7 @@ private: bool FeatureDisable; int LDSBankCount; unsigned IsaVersion; + bool EnableHugeScratchBuffer; AMDGPUFrameLowering FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; @@ -271,6 +272,10 @@ public: return DevName; } + bool enableHugeScratchBuffer() const { + return EnableHugeScratchBuffer; + } + bool dumpCode() const { return DumpCode; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index c9b25a1..d918ac3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1719,7 +1719,6 @@ MachineBasicBlock * AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { MachineBasicBlock *LoopHeader = LoopRep->getHeader(); MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); if (!LoopHeader || !LoopLatch) return nullptr; @@ -1732,18 +1731,9 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { FuncRep->push_back(DummyExitBlk); //insert to function SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - MachineBasicBlock::iterator I = BranchMI; - unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra register needed to handle CFG"); - MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); - MachineInstrBuilder MIB(*FuncRep, NewMI); - MIB.addMBB(LoopHeader); - MIB.addReg(ImmReg, false); - SHOWNEWINSTR(NewMI); - 
BranchMI->eraseFromParent(); - LoopLatch->addSuccessor(DummyExitBlk); - - return DummyExitBlk; + LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext(); + Ctx.emitError("Extra register needed to handle CFG"); + return nullptr; } void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index 69efb8b..d9a0723 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -138,3 +138,7 @@ def : ProcessorModel<"iceland", SIQuarterSpeedModel, def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"fiji", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dd818a9..099b0b1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -254,6 +254,12 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, return false; } +bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -263,8 +269,21 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Assume the we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM); + } + // fall-through case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and // additionally can do r + r + i with addr64. 32-bit has more addressing @@ -324,11 +343,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } - case AMDGPUAS::FLAT_ADDRESS: { - // Flat instructions do not have offsets, and only have the register - // address. 
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); - } + case AMDGPUAS::FLAT_ADDRESS: + return isLegalFlatAddressingMode(AM); + default: llvm_unreachable("unhandled address space"); } @@ -812,10 +829,29 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FINode->getIndex(); - return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); + // A FrameIndex node represents a 32-bit offset into scratch memory. If + // the high bit of a frame index offset were to be set, this would mean + // that it represented an offset of ~2GB * 64 = ~128GB from the start of the + // scratch buffer, with 64 being the number of threads per wave. + // + // If we know the machine uses less than 128GB of scratch, then we can + // amrk the high bit of the FrameIndex node as known zero, + // which is important, because it means in most situations we can + // prove that values derived from FrameIndex nodes are non-negative. + // This enables us to take advantage of more addressing modes when + // accessing scratch buffers, since for scratch reads/writes, the register + // offset must always be positive. + + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); + if (Subtarget->enableHugeScratchBuffer()) + return TFI; + + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); } /// This transforms the control flow intrinsics to get the branch destination as @@ -2034,6 +2070,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } +static bool isFrameIndexOp(SDValue Op) { + if (Op.getOpcode() == ISD::AssertZext) + Op = Op.getOperand(0); + + return isa<FrameIndexSDNode>(Op); +} + /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. 
@@ -2042,7 +2085,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < Node->getNumOperands(); ++i) { - if (!isa<FrameIndexSDNode>(Node->getOperand(i))) { + if (!isFrameIndexOp(Node->getOperand(i))) { Ops.push_back(Node->getOperand(i)); continue; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index 635b4ed..d84c32e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -56,6 +56,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + bool isLegalFlatAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b39a787..8d8110b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1600,12 +1600,14 @@ multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, SIMCInstr <opName#"_e32", SISubtarget.SI> { let Defs = !if(DefExec, [EXEC], []); let hasSideEffects = DefExec; + let AssemblerPredicates = [isSICI]; } def _vi : VOPC<op.VI, ins, asm, []>, SIMCInstr <opName#"_e32", SISubtarget.VI> { let Defs = !if(DefExec, [EXEC], []); let hasSideEffects = DefExec; + let AssemblerPredicates = [isVI]; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index 1ee63c6..f78ffd7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2910,9 +2910,6 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; } // End Predicates = [isSICI] class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < @@ -3273,13 +3270,13 @@ def : Pat < (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), (V_CNDMASK_B64_PSEUDO - $x, (V_MIN_F64 SRCMODS.NONE, (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), SRCMODS.NONE, (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), + $x, (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) >; @@ -3291,13 +3288,13 @@ def : Pat < $x, SRCMODS.NEG, (V_CNDMASK_B64_PSEUDO - $x, (V_MIN_F64 SRCMODS.NONE, (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), SRCMODS.NONE, (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), + $x, (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), DSTCLAMP.NONE, DSTOMOD.NONE) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index d23b92e..587ea63 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp 
@@ -53,6 +53,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifer // complaining about use of an undefined physical register. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp index b086d2e..0a7f684 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp @@ -91,6 +91,7 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { if (ScratchOffsetReg != AMDGPU::NoRegister) { // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) .addReg(ScratchOffsetPreloadReg); } else { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ce4acaf..54c4d54 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -348,7 +348,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::SReg_128RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, - &AMDGPU::VReg_512RegClass + &AMDGPU::VReg_512RegClass, + &AMDGPU::SReg_512RegClass }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -499,7 +500,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; ++I) { - if (MRI.reg_nodbg_empty(*I)) + if (!MRI.isPhysRegUsed(*I)) return *I; } return AMDGPU::NoRegister; diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td index 5bf86e6..aca4673 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -103,4 +103,46 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, 
i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + + } // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index e335784..8cc06df 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4583,6 +4583,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDLoc dl(Op); + if (CmpVT.getVectorElementType() == MVT::i64) + // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, + // but it's possible that our operands are 64-bit but our result is 32-bit. + // Bail in this case. + return SDValue(); + if (Op1.getValueType().isFloatingPoint()) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 3735281..265b86f 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -118,7 +118,6 @@ namespace { }; SpecificBumpPtrAllocator<MergeCandidate> Allocator; SmallVector<const MergeCandidate*,4> Candidates; - SmallVector<MachineInstr*,4> MergeBaseCandidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator Before); @@ -141,7 +140,6 @@ namespace { MachineBasicBlock::iterator &MBBI); bool MergeBaseUpdateLoadStore(MachineInstr *MI); bool MergeBaseUpdateLSMultiple(MachineInstr *MI); - bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; @@ -933,6 +931,11 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1) CanMergeToLSMulti = false; + // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it + // deprecated; LDM to PC is fine but cannot happen here. + if (PReg == ARM::SP || PReg == ARM::PC) + CanMergeToLSMulti = CanMergeToLSDouble = false; + // Merge following instructions where possible. for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { int NewOffset = MemOps[I].Offset; @@ -940,16 +943,15 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); unsigned Reg = MO.getReg(); - unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); + if (Reg == ARM::SP || Reg == ARM::PC) + break; // See if the current load/store may be part of a multi load/store. + unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); bool PartOfLSMulti = CanMergeToLSMulti; if (PartOfLSMulti) { - // Cannot load from SP - if (Reg == ARM::SP) - PartOfLSMulti = false; // Register numbers must be in ascending order. 
- else if (RegNum <= PRegNum) + if (RegNum <= PRegNum) PartOfLSMulti = false; // For VFP / NEON load/store multiples, the registers must be // consecutive and within the limit on the number of registers per @@ -993,6 +995,76 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } while (SIndex < EIndex); } +static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg) { + unsigned MyPredReg = 0; + if (!MI) + return false; + + bool CheckCPSRDef = false; + switch (MI->getOpcode()) { + default: return false; + case ARM::tSUBi8: + case ARM::t2SUBri: + case ARM::SUBri: + CheckCPSRDef = true; + break; + case ARM::tSUBspi: + break; + } + + // Make sure the offset fits in 8 bits. + if (Bytes == 0 || (Limit && Bytes >= Limit)) + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tSUBspi || + MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME + if (!(MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm() * Scale) == Bytes && + getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg)) + return false; + + return CheckCPSRDef ? !definesCPSR(MI) : true; +} + +static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg) { + unsigned MyPredReg = 0; + if (!MI) + return false; + + bool CheckCPSRDef = false; + switch (MI->getOpcode()) { + default: return false; + case ARM::tADDi8: + case ARM::t2ADDri: + case ARM::ADDri: + CheckCPSRDef = true; + break; + case ARM::tADDspi: + break; + } + + if (Bytes == 0 || (Limit && Bytes >= Limit)) + // Make sure the offset fits in 8 bits. + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tADDspi || + MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME + if (!(MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm() * Scale) == Bytes && + getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg)) + return false; + + return CheckCPSRDef ? !definesCPSR(MI) : true; +} + static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, ARM_AM::AMSubMode Mode) { switch (Opc) { @@ -1060,75 +1132,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } } -/// Check if the given instruction increments or decrements a register and -/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags -/// generated by the instruction are possibly read as well. -static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg) { - bool CheckCPSRDef; - int Scale; - switch (MI.getOpcode()) { - case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; - case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; - case ARM::t2SUBri: - case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; - case ARM::t2ADDri: - case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; - case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; - case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; - default: return 0; - } - - unsigned MIPredReg; - if (MI.getOperand(0).getReg() != Reg || - MI.getOperand(1).getReg() != Reg || - getInstrPredicate(&MI, MIPredReg) != Pred || - MIPredReg != PredReg) - return 0; - - if (CheckCPSRDef && definesCPSR(&MI)) - return 0; - return MI.getOperand(2).getImm() * Scale; -} - -/// Searches for an increment or decrement of \p Reg before \p MBBI. 
-static MachineBasicBlock::iterator -findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { - Offset = 0; - MachineBasicBlock &MBB = *MBBI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (MBBI == BeginMBBI) - return EndMBBI; - - // Skip debug values. - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI) - --PrevMBBI; - - Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg); - return Offset == 0 ? EndMBBI : PrevMBBI; -} - -/// Searches for a increment or decrement of \p Reg after \p MBBI. -static MachineBasicBlock::iterator -findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { - Offset = 0; - MachineBasicBlock &MBB = *MBBI->getParent(); - MachineBasicBlock::iterator EndMBBI = MBB.end(); - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - // Skip debug values. - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if (NextMBBI == EndMBBI) - return EndMBBI; - - Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); - return Offset == 0 ? EndMBBI : NextMBBI; -} - /// Fold proceeding/trailing inc/dec of base register into the /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: /// @@ -1148,6 +1151,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { const MachineOperand &BaseOP = MI->getOperand(0); unsigned Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned Opcode = MI->getOpcode(); @@ -1159,24 +1163,49 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (MI->getOperand(i).getReg() == Base) return false; - int Bytes = getLSMultipleTransferSize(MI); + bool DoMerge = false; + ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); + + // Try merging with the previous instruction. MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - int Offset; - MachineBasicBlock::iterator MergeInstr - = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); - ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); - if (Mode == ARM_AM::ia && Offset == -Bytes) { - Mode = ARM_AM::db; - } else if (Mode == ARM_AM::ib && Offset == -Bytes) { - Mode = ARM_AM::da; - } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); - if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && - ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) - return false; + if (MBBI != BeginMBBI) { + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; + if (Mode == ARM_AM::ia && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::db; + DoMerge = true; + } else if (Mode == ARM_AM::ib && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::da; + DoMerge = true; + } + if (DoMerge) + MBB.erase(PrevMBBI); } - MBB.erase(MergeInstr); + + // Try merging with the next instruction. 
+ MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && + isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && + isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } + if (DoMerge) + MBB.erase(NextMBBI); + } + + if (!DoMerge) + return false; unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) @@ -1254,6 +1283,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || @@ -1265,6 +1295,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; + bool isLd = isLoadSingle(Opcode); // Can't do the merge if the destination register is the same as the would-be // writeback register. if (MI->getOperand(0).getReg() == Base) @@ -1272,31 +1303,55 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - int Bytes = getLSMultipleTransferSize(MI); + bool DoMerge = false; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + unsigned NewOpc = 0; + // AM2 - 12 bits, thumb2 - 8 bits. + unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); + + // Try merging with the previous instruction. MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - int Offset; - MachineBasicBlock::iterator MergeInstr - = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); - unsigned NewOpc; - if (!isAM5 && Offset == Bytes) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add); - } else if (Offset == -Bytes) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); - } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); - if (Offset == Bytes) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); - } else if (!isAM5 && Offset == -Bytes) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); - } else - return false; + if (MBBI != BeginMBBI) { + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; + if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (!isAM5 && + isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); + MBB.erase(PrevMBBI); + } } - MBB.erase(MergeInstr); - ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; + // Try merging with the next instruction. 
+ MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (!isAM5 && + isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); + MBB.erase(NextMBBI); + } + } + + if (!DoMerge) + return false; - bool isLd = isLoadSingle(Opcode); if (isAM5) { // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the @@ -1313,16 +1368,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); } } else { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) @@ -1334,12 +1391,13 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); } else { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) @@ -1351,66 +1409,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { return true; } -bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { - unsigned Opcode = MI.getOpcode(); - assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && - "Must have t2STRDi8 or t2LDRDi8"); - if (MI.getOperand(3).getImm() != 0) - return false; - - // Behaviour for writeback is undefined if base register is the same as one - // of the others. 
- const MachineOperand &BaseOp = MI.getOperand(2); - unsigned Base = BaseOp.getReg(); - const MachineOperand &Reg0Op = MI.getOperand(0); - const MachineOperand &Reg1Op = MI.getOperand(1); - if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) - return false; - - unsigned PredReg; - ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); - MachineBasicBlock::iterator MBBI(MI); - MachineBasicBlock &MBB = *MI.getParent(); - int Offset; - MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, - PredReg, Offset); - unsigned NewOpc; - if (Offset == 8 || Offset == -8) { - NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; - } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); - if (Offset == 8 || Offset == -8) { - NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; - } else - return false; - } - MBB.erase(MergeInstr); - - DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); - if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { - MIB.addOperand(Reg0Op).addOperand(Reg1Op) - .addReg(BaseOp.getReg(), RegState::Define); - } else { - assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); - MIB.addReg(BaseOp.getReg(), RegState::Define) - .addOperand(Reg0Op).addOperand(Reg1Op); - } - MIB.addReg(BaseOp.getReg(), RegState::Kill) - .addImm(Offset).addImm(Pred).addReg(PredReg); - assert(TII->get(Opcode).getNumOperands() == 6 && - TII->get(NewOpc).getNumOperands() == 7 && - "Unexpected number of operands in Opcode specification."); - - // Transfer implicit operands. - for (const MachineOperand &MO : MI.implicit_operands()) - MIB.addOperand(MO); - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - - MBB.erase(MBBI); - return true; -} - /// Returns true if instruction is a memory operation that this pass is capable /// of operating on. static bool isMemoryOp(const MachineInstr *MI) { @@ -1618,7 +1616,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned Position = 0; assert(Candidates.size() == 0); - assert(MergeBaseCandidates.size() == 0); LiveRegsValid = false; for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); @@ -1697,15 +1694,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MBBI = I; --Position; // Fallthrough to look into existing chain. - } else if (MBBI->isDebugValue()) { + } else if (MBBI->isDebugValue()) continue; - } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || - MBBI->getOpcode() == ARM::t2STRDi8) { - // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions - // remember them because we may still be able to merge add/sub into them. - MergeBaseCandidates.push_back(MBBI); - } - // If we are here then the chain is broken; Extract candidates for a merge. if (MemOps.size() > 0) { @@ -1736,9 +1726,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (Merged) { Changed = true; unsigned Opcode = Merged->getOpcode(); - if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) - MergeBaseUpdateLSDouble(*Merged); - else + if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { @@ -1753,10 +1741,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } Candidates.clear(); - // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. 
- for (MachineInstr *MI : MergeBaseCandidates) - MergeBaseUpdateLSDouble(*MI); - MergeBaseCandidates.clear(); return Changed; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 028119c..216e776 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -57,7 +57,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Some things to try that should be better: // * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11 // * 'movs $dst, $src' if cpsr isn't live - // See: http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-August/075998.html + // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH))) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 29283c8..21a8996 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (!MRI.reg_nodbg_empty(FreeReg)) + if (MRI.isPhysRegUsed(FreeReg)) continue; // Check aliased register usage. bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (!MRI.reg_nodbg_empty(*AI)) { + if (MRI.isPhysRegUsed(*AI)) { IsCurrentRegUsed = true; break; } diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td index c37cf95..f917eca 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -500,14 +500,6 @@ def : MipsPat<(trunc (assertzext GPR64:$src)), def : MipsPat<(i32 (trunc GPR64:$src)), (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>; -// Bypass trunc nodes for bitwise ops. 
-def : MipsPat<(i32 (trunc (and GPR64:$lhs, GPR64:$rhs))), - (EXTRACT_SUBREG (AND64 GPR64:$lhs, GPR64:$rhs), sub_32)>; -def : MipsPat<(i32 (trunc (or GPR64:$lhs, GPR64:$rhs))), - (EXTRACT_SUBREG (OR64 GPR64:$lhs, GPR64:$rhs), sub_32)>; -def : MipsPat<(i32 (trunc (xor GPR64:$lhs, GPR64:$rhs))), - (EXTRACT_SUBREG (XOR64 GPR64:$lhs, GPR64:$rhs), sub_32)>; - // variable shift instructions patterns def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))), (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>; diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp index e2f6fcc..5152a07 100644 --- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -267,6 +267,9 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, } unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) { + if (!TargetSupported) + return 0; + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 && "Alloca should always return a pointer."); @@ -290,12 +293,7 @@ unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) { return 0; const TargetRegisterClass *RC = &Mips::GPR32RegClass; const ConstantInt *CI = cast<ConstantInt>(C); - int64_t Imm; - if ((VT != MVT::i1) && CI->isNegative()) - Imm = CI->getSExtValue(); - else - Imm = CI->getZExtValue(); - return materialize32BitInt(Imm, RC); + return materialize32BitInt(CI->getZExtValue(), RC); } unsigned MipsFastISel::materialize32BitInt(int64_t Imm, @@ -382,6 +380,9 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) { // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) { + if (!TargetSupported) + return 0; + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. @@ -981,6 +982,13 @@ bool MipsFastISel::selectSelect(const Instruction *I) { if (!Src1Reg || !Src2Reg || !CondReg) return false; + unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass); + if (!ZExtCondReg) + return false; + + if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true)) + return false; + unsigned ResultReg = createResultReg(RC); unsigned TempReg = createResultReg(RC); @@ -989,7 +997,7 @@ bool MipsFastISel::selectSelect(const Instruction *I) { emitInst(TargetOpcode::COPY, TempReg).addReg(Src2Reg); emitInst(CondMovOpc, ResultReg) - .addReg(Src1Reg).addReg(CondReg).addReg(TempReg); + .addReg(Src1Reg).addReg(ZExtCondReg).addReg(TempReg); updateValueMap(I, ResultReg); return true; } @@ -1232,12 +1240,19 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, } bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) { + if (!TargetSupported) + return false; + CallingConv::ID CC = CLI.CallConv; bool IsTailCall = CLI.IsTailCall; bool IsVarArg = CLI.IsVarArg; const Value *Callee = CLI.Callee; MCSymbol *Symbol = CLI.Symbol; + // Do not handle FastCC. + if (CC == CallingConv::Fast) + return false; + // Allow SelectionDAG isel to handle tail calls. 
if (IsTailCall) return false; @@ -1312,6 +1327,9 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) { } bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + if (!TargetSupported) + return false; + switch (II->getIntrinsicID()) { default: return false; @@ -1415,6 +1433,11 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); + + // Do not handle FastCC. + if (CC == CallingConv::Fast) + return false; + SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index fbebb9a..fab2fdf 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" @@ -53,11 +54,6 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden, cl::desc("MIPS: Don't trap on integer division by zero."), cl::init(false)); -cl::opt<bool> -EnableMipsFastISel("mips-fast-isel", cl::Hidden, - cl::desc("Allow mips-fast-isel to be used"), - cl::init(false)); - static const MCPhysReg Mips64DPRegs[8] = { Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64 @@ -461,7 +457,7 @@ const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM FastISel * MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { - if (!EnableMipsFastISel) + if (!funcInfo.MF->getTarget().Options.EnableFastISel) return TargetLowering::createFastISel(funcInfo, libInfo); return Mips::createFastISel(funcInfo, libInfo); } diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 4799ea2..93a503c 100644 --- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -12,6 +12,7 @@ #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -22,10 +23,12 @@ typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { class PPCDisassembler : public MCDisassembler { + bool IsLittleEndian; + public: - PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx) {} - ~PPCDisassembler() override {} + PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + bool IsLittleEndian) + : MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {} DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -37,7 +40,13 @@ public: static MCDisassembler *createPPCDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new PPCDisassembler(STI, Ctx); + return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false); +} + +static MCDisassembler *createPPCLEDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true); } extern 
"C" void LLVMInitializePowerPCDisassembler() { @@ -47,7 +56,7 @@ extern "C" void LLVMInitializePowerPCDisassembler() { TargetRegistry::RegisterMCDisassembler(ThePPC64Target, createPPCDisassembler); TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget, - createPPCDisassembler); + createPPCLEDisassembler); } // FIXME: These can be generated by TableGen from the existing register @@ -383,9 +392,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } - // The instruction is big-endian encoded. - uint32_t Inst = - (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0); + // Read the instruction in the proper endianness. + uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data()) + : support::endian::read32be(Bytes.data()); if (STI.getFeatureBits()[PPC::FeatureQPX]) { DecodeStatus result = diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 199a0de..4444466 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -363,71 +363,85 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, SM.recordPatchPoint(MI); PatchPointOpers Opers(&MI); - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); unsigned EncodedBytes = 0; - if (CallTarget) { - assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && - "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 0; - // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) - .addReg(ScratchReg) - .addImm((CallTarget >> 32) & 0xFFFF)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(32).addImm(16)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm((CallTarget >> 16) & 0xFFFF)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(CallTarget & 0xFFFF)); - - // Save the current TOC pointer before the remote call. - int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) - .addReg(PPC::X2) - .addImm(TOCSaveOffset) - .addReg(PPC::X1)); - ++EncodedBytes; - - - // If we're on ELFv1, then we need to load the actual function pointer from - // the function descriptor. - if (!Subtarget->isELFv2ABI()) { - // Load the new TOC pointer and the function address, but not r11 - // (needing this is rare, and loading it here would prevent passing it - // via a 'nest' parameter. 
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + const MachineOperand &CalleeMO = + Opers.getMetaOper(PatchPointOpers::TargetPos); + + if (CalleeMO.isImm()) { + int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + if (CallTarget) { + assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && + "High 16 bits of call target should be zero."); + unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + EncodedBytes = 0; + // Materialize the jump address: + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) + .addReg(ScratchReg) + .addImm((CallTarget >> 32) & 0xFFFF)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(32).addImm(16)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm((CallTarget >> 16) & 0xFFFF)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(CallTarget & 0xFFFF)); + + // Save the current TOC pointer before the remote call. + int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) .addReg(PPC::X2) - .addImm(8) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; + + + // If we're on ELFv1, then we need to load the actual function pointer + // from the function descriptor. + if (!Subtarget->isELFv2ABI()) { + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter. + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(8) + .addReg(ScratchReg)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(ScratchReg) + .addImm(0) + .addReg(ScratchReg)); + ++EncodedBytes; + } + + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8) .addReg(ScratchReg)); ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + ++EncodedBytes; + + // Restore the TOC pointer after the call. EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) - .addReg(ScratchReg) - .addImm(0) - .addReg(ScratchReg)); + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); ++EncodedBytes; } + } else if (CalleeMO.isGlobal()) { + const GlobalValue *GValue = CalleeMO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); - ++EncodedBytes; - - // Restore the TOC pointer after the call. - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) - .addReg(PPC::X2) - .addImm(TOCSaveOffset) - .addReg(PPC::X1)); - ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP) + .addExpr(SymVar)); + EncodedBytes += 2; } // Each instruction is 4 bytes. 
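A note on the PPCDisassembler change earlier in this patch: it registers separate big- and little-endian disassemblers and reads the 4-byte instruction word in the matching byte order. The following is a minimal standalone sketch of that byte-order selection, written without LLVM's support::endian helpers; the function name is illustrative only and is not part of the patch or of LLVM.

#include <cstdint>

// Illustrative only: mirrors the read32le/read32be selection in
// PPCDisassembler::getInstruction, where IsLittleEndian is fixed when the
// disassembler is created (createPPCDisassembler vs createPPCLEDisassembler).
static uint32_t readPPCInstruction(const uint8_t *Bytes, bool IsLittleEndian) {
  if (IsLittleEndian)
    return (uint32_t)Bytes[0] | ((uint32_t)Bytes[1] << 8) |
           ((uint32_t)Bytes[2] << 16) | ((uint32_t)Bytes[3] << 24);
  return ((uint32_t)Bytes[0] << 24) | ((uint32_t)Bytes[1] << 16) |
         ((uint32_t)Bytes[2] << 8) | (uint32_t)Bytes[3];
}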
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 87229d8..08ae717 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -306,10 +306,9 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MRI.isPhysRegModified(VRRegNo[i])) + if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 01a3acb..b6025bf 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -2305,14 +2305,15 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { if (Swap) std::swap(LHS, RHS); + EVT ResVT = VecVT.changeVectorElementTypeToInteger(); if (Negate) { - SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); + SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, - VecVT, VCmp, VCmp); + ResVT, VCmp, VCmp); } - return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS); + return CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); } if (PPCSubTarget->useCRBits()) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0ed9b05..1e28913 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -580,6 +580,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); + addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); @@ -1416,7 +1417,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, } else return -1; - if (ShuffleKind == 2 && isLE) + if (isLE) ShiftAmt = 16 - ShiftAmt; return ShiftAmt; @@ -1429,6 +1430,11 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { assert(N->getValueType(0) == MVT::v16i8 && (EltSize == 1 || EltSize == 2 || EltSize == 4)); + // The consecutive indices need to specify an element, not part of two + // different elements. So abandon ship early if this isn't the case. + if (N->getMaskElt(0) % EltSize != 0) + return false; + // This is a splat operation if each element of the permute is the same, and // if the value doesn't reference the second vector. unsigned ElementBase = N->getMaskElt(0); @@ -7011,17 +7017,20 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // t = vsplti c, result = vsldoi t, t, 1 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); - return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl); + unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 
0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); - return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl); + unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); - return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl); + unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } } @@ -9957,6 +9966,9 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, if (Src.getValueType() == MVT::f32) { Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); DCI.AddToWorklist(Src.getNode()); + } else if (Src.getValueType() != MVT::f64) { + // Make sure that we don't pick up a ppc_fp128 source value. + return SDValue(); } unsigned FCTOp = diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index 8fa10dc..c0279da 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (!MRI->reg_nodbg_empty(reg)) + if (MRI->isPhysRegUsed(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (!MRI->reg_nodbg_empty(reg)) + if (MRI->isPhysRegUsed(reg)) return false; return true; @@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed - || !MRI.reg_nodbg_empty(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || MRI.isPhysRegUsed(SP::L0) // Too many registers needed + || MRI.isPhysRegUsed(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { @@ -218,13 +218,16 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (MRI.reg_nodbg_empty(reg)) + if (!MRI.isPhysRegUsed(reg)) continue; unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(MRI.reg_nodbg_empty(mapped_reg)); + assert(!MRI.isPhysRegUsed(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); + + // Mark the reg unused. + MRI.setPhysRegUnused(reg); } // Rewrite MBB's Live-ins. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td index be8f00b..bdd1b15 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -53,10 +53,6 @@ def RetCC_SystemZ : CallingConv<[ CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> - - // ABI-compliant code returns long double by reference, but that conversion - // is left to higher-level code. Perhaps we could add an f128 definition - // here for code that doesn't care about the ABI? 
]>; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 056ee02..9a753c8 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1175,6 +1175,20 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, return Chain; } +bool SystemZTargetLowering:: +CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + // Detect unsupported vector return types. + if (Subtarget.hasVector()) + VerifyVectorTypes(Outs); + + SmallVector<CCValAssign, 16> RetLocs; + CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context); + return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ); +} + SDValue SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 949b67f..07ff251 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -423,6 +423,10 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 418f043..bca059d 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -681,6 +681,9 @@ private: std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); + void AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst); std::unique_ptr<X86Operand> ParseOperand(); std::unique_ptr<X86Operand> ParseATTOperand(); std::unique_ptr<X86Operand> ParseIntelOperand(); @@ -1014,6 +1017,19 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { Loc, Loc, 0); } +void X86AsmParser::AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) { + if (isParsingIntelSyntax()) { + Operands.push_back(std::move(Dst)); + Operands.push_back(std::move(Src)); + } + else { + Operands.push_back(std::move(Src)); + Operands.push_back(std::move(Dst)); + } +} + std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { if (isParsingIntelSyntax()) return ParseIntelOperand(); @@ -2228,26 +2244,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (Name.startswith("ins") && Operands.size() == 1 && (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" )) { - if (isParsingIntelSyntax()) { - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } else { - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - 
Operands.push_back(DefaultMemDIOperand(NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), + DefaultMemDIOperand(NameLoc)); } // Append default arguments to "outs[bwld]" if (Name.startswith("outs") && Operands.size() == 1 && (Name == "outsb" || Name == "outsw" || Name == "outsl" || Name == "outsd" )) { - if (isParsingIntelSyntax()) { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - } else { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); } // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate @@ -2279,13 +2287,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" || Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) { if (Operands.size() == 1) { - if (isParsingIntelSyntax()) { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } else { - Operands.push_back(DefaultMemDIOperand(NameLoc)); - Operands.push_back(DefaultMemSIOperand(NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + DefaultMemDIOperand(NameLoc), + DefaultMemSIOperand(NameLoc)); } else if (Operands.size() == 3) { X86Operand &Op = (X86Operand &)*Operands[1]; X86Operand &Op2 = (X86Operand &)*Operands[2]; @@ -2305,13 +2309,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (Operands.size() == 1) { if (Name == "movsd") Operands.back() = X86Operand::CreateToken("movsl", NameLoc); - if (isParsingIntelSyntax()) { - Operands.push_back(DefaultMemDIOperand(NameLoc)); - Operands.push_back(DefaultMemSIOperand(NameLoc)); - } else { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + DefaultMemDIOperand(NameLoc)); } else if (Operands.size() == 3) { X86Operand &Op = (X86Operand &)*Operands[1]; X86Operand &Op2 = (X86Operand &)*Operands[2]; diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 36a8cdb..40b9c8a 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -301,9 +301,8 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { + if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { FPIsUsed = true; break; } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 2a35c4c..3a21b57 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1682,6 +1682,8 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); + MF.getRegInfo().setPhysRegUsed(Reg10); + MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); diff --git 
a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 6e22ab3..71ccb1a 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12640,24 +12640,29 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { if (User->getOpcode() == ISD::FNEG) return Op; - SDValue Op0 = Op.getOperand(0); - bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); - SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - // Assume scalar op for initialization; update for vector if needed. - // Note that there are no scalar bitwise logical SSE/AVX instructions, so we - // generate a 16-byte vector constant and logic op even for the scalar case. - // Using a 16-byte mask allows folding the load of the mask with - // the logic op, so it can save (~4 bytes) on code size. - MVT EltVT = VT; - unsigned NumElts = VT == MVT::f64 ? 2 : 4; + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. + + MVT LogicVT; + MVT EltVT; + unsigned NumElts; + if (VT.isVector()) { + LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else { + // There are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + EltVT = VT; + NumElts = (VT == MVT::f64) ? 2 : 4; } unsigned EltBits = EltVT.getSizeInBits(); @@ -12670,26 +12675,25 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); - if (VT.isVector()) { - // For a vector, cast operands to a vector type, perform the logic op, - // and cast the result back to the original value type. - MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue MaskCasted = DAG.getBitcast(VecVT, Mask); - SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0)) - : DAG.getBitcast(VecVT, Op0); - unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; - return DAG.getBitcast(VT, - DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); - } - - // If not vector, then scalar. - unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + unsigned LogicOp = + IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - return DAG.getNode(BitOp, dl, VT, Operand, Mask); + + if (VT.isVector()) + return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + + // For the scalar case extend to a 128-bit vector, perform the logic op, + // and extract the scalar result back out. 
+ Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); + SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12729,10 +12733,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { Constant *C = ConstantVector::get(CV); auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, + + // Perform all logic operations as 16-byte vectors because there are no + // scalar FP logic instructions in SSE. This allows load folding of the + // constants into the logic instructions. + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); - SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). // If it's a constant, we can clear it here. @@ -12740,7 +12750,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return SignBit; + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12750,15 +12761,18 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. - if (!isa<ConstantFPSDNode>(Op0)) - Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); - + if (!isa<ConstantFPSDNode>(Op0)) { + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); + } // OR the magnitude value with the sign bit. - return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); + Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 78615076..cf68ef0 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -956,18 +956,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - // FIXME: We should not be folding Fs* scalar loads into vector - // instructions because the vector instructions require vector-sized - // loads. Lowering should create vector-sized instructions (the Fv* - // variants below) to allow load folding. 
- { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, - { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, - { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, - { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, - { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, - { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, - { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, - { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + // Do not fold Fs* scalar logical op loads because there are no scalar + // load variants for these instructions. When folded, the load is required + // to be 128-bits, so the load size would not match. { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index a5ff9ed..99386b0 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -2919,6 +2919,14 @@ multiclass sse12_fp_packed_vector_logical_alias< defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, PD, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, + PS, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, + PD, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ee21c81..15e0889 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -93,7 +93,8 @@ static Value *getFCmpValue(bool isordered, unsigned code, case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break; case 6: Pred = isordered ? 
FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break; case 7: - if (!isordered) return ConstantInt::getTrue(LHS->getContext()); + if (!isordered) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); Pred = FCmpInst::FCMP_ORD; break; } return Builder->CreateFCmp(Pred, LHS, RHS); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 0bd6fd2..95bba3c 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2112,9 +2112,8 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, Value *RHS, Instruction &OrigI, Value *&Result, Constant *&Overflow) { - assert((!OrigI.isCommutative() || - !(isa<Constant>(LHS) && !isa<Constant>(RHS))) && - "call with a constant RHS if possible!"); + if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS)) + std::swap(LHS, RHS); auto SetResult = [&](Value *OpResult, Constant *OverflowVal, bool ReuseName) { Result = OpResult; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index d536a93..029b44c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -658,7 +658,7 @@ bool EarlyCSE::run() { // gains over vector when the container becomes very large due to the // specific access patterns. For more information see the mailing list // discussion on this: - // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html + // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html std::deque<StackNode *> nodesToProcess; bool Changed = false; diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index d1a0a82..947513a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1847,10 +1847,17 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (OldTy == NewTy) return true; - if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) - if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) - if (NewITy->getBitWidth() >= OldITy->getBitWidth()) - return true; + + // For integer types, we can't handle any bit-width differences. This would + // break both vector conversions with extension and introduce endianness + // issues when in conjunction with loads and stores. 
+ if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) { + assert(cast<IntegerType>(OldTy)->getBitWidth() != + cast<IntegerType>(NewTy)->getBitWidth() && + "We can't have the same bitwidth for different int types"); + return false; + } + if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) @@ -1885,10 +1892,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, if (OldTy == NewTy) return V; - if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) - if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) - if (NewITy->getBitWidth() > OldITy->getBitWidth()) - return IRB.CreateZExt(V, NewITy); + assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) && + "Integer types must be the exact same to convert."); // See if we need inttoptr for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. @@ -2134,6 +2139,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; + // We can't handle loads that extend past the allocated memory. + if (DL.getTypeStoreSize(LI->getType()) > Size) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2152,6 +2160,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S, Type *ValueTy = SI->getValueOperand()->getType(); if (SI->isVolatile()) return false; + // We can't handle stores that extend past the allocated memory. + if (DL.getTypeStoreSize(ValueTy) > Size) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2585,6 +2596,7 @@ private: Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); + const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2592,13 +2604,27 @@ private: } else if (IntTy && LI.getType()->isIntegerTy()) { V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && - canConvertValue(DL, NewAllocaTy, LI.getType())) { + NewEndOffset == NewAllocaEndOffset && + (canConvertValue(DL, NewAllocaTy, TargetTy) || + (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && + TargetTy->isIntegerTy()))) { LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), LI.getName()); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); - V = NewLI; + + // If this is an integer load past the end of the slice (which means the + // bytes outside the slice are undef or this load is dead) just forcibly + // fix the integer size with correct handling of endianness. 
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) + if (auto *TITy = dyn_cast<IntegerType>(TargetTy)) + if (AITy->getBitWidth() < TITy->getBitWidth()) { + V = IRB.CreateZExt(V, TITy, "load.ext"); + if (DL.isBigEndian()) + V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(), + "endian_shift"); + } } else { Type *LTy = TargetTy->getPointerTo(); LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), @@ -2718,10 +2744,25 @@ private: if (IntTy && V->getType()->isIntegerTy()) return rewriteIntegerStore(V, SI); + const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize; StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && - canConvertValue(DL, V->getType(), NewAllocaTy)) { + (canConvertValue(DL, V->getType(), NewAllocaTy) || + (IsStorePastEnd && NewAllocaTy->isIntegerTy() && + V->getType()->isIntegerTy()))) { + // If this is an integer store past the end of slice (and thus the bytes + // past that point are irrelevant or this is unreachable), truncate the + // value prior to storing. + if (auto *VITy = dyn_cast<IntegerType>(V->getType())) + if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) + if (VITy->getBitWidth() > AITy->getBitWidth()) { + if (DL.isBigEndian()) + V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(), + "endian_shift"); + V = IRB.CreateTrunc(V, AITy, "load.trunc"); + } + V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index d55dc6a..0493003 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -227,10 +227,16 @@ Value *Scatterer::operator[](unsigned I) { if (!Idx) break; unsigned J = Idx->getZExtValue(); - CV[J] = Insert->getOperand(1); V = Insert->getOperand(0); - if (I == J) + if (I == J) { + CV[J] = Insert->getOperand(1); return CV[J]; + } else if (!CV[J]) { + // Only cache the first entry we find for each index we're not actively + // searching for. This prevents us from going too far up the chain and + // caching incorrect entries. + CV[J] = Insert->getOperand(1); + } } CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I), V->getName() + ".i" + Twine(I)); |
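A note on the SROA changes above: they stop treating integer types of different widths as freely convertible and instead handle loads and stores that run past the end of a slice explicitly, adding an extra shift on big-endian targets. The following is a standalone sketch of that byte-placement rule, assuming widths that are at most 64 bits; the function name is illustrative only and is not LLVM's.

#include <cassert>
#include <cstdint>

// Illustrative only: models the zext + "endian_shift" sequence added to the
// SROA load-rewriting path. The narrow alloca value occupies the
// lowest-addressed bytes of the wider load; on big-endian targets those are
// the most significant bytes, so the zero-extended value is shifted left.
static uint64_t widenPastEndLoad(uint64_t NarrowVal, unsigned NarrowBits,
                                 unsigned WideBits, bool IsBigEndian) {
  assert(NarrowBits < WideBits && WideBits <= 64 && "widths out of range");
  uint64_t Wide = NarrowVal;            // corresponds to CreateZExt
  if (IsBigEndian)
    Wide <<= (WideBits - NarrowBits);   // corresponds to the endian_shift
  return Wide;
}

// The store path added above is the mirror image: a wide value stored into a
// narrower integer alloca is shifted right by the same amount on big-endian
// targets before being truncated.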