diff options
author | dim <dim@FreeBSD.org> | 2011-06-12 15:42:51 +0000 |
---|---|---|
committer | dim <dim@FreeBSD.org> | 2011-06-12 15:42:51 +0000 |
commit | ece02cd5829cea836e9365b0845a8ef042d17b0a (patch) | |
tree | b3032e51d630e8070e9e08d6641648f195316a80 /lib/Target/X86 | |
parent | 2b066988909948dc3d53d01760bc2d71d32f3feb (diff) | |
download | FreeBSD-src-ece02cd5829cea836e9365b0845a8ef042d17b0a.zip FreeBSD-src-ece02cd5829cea836e9365b0845a8ef042d17b0a.tar.gz |
Vendor import of llvm trunk r132879:
http://llvm.org/svn/llvm-project/llvm/trunk@132879
Diffstat (limited to 'lib/Target/X86')
27 files changed, 1196 insertions, 1048 deletions
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index dd6e353..68247d2 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -41,6 +41,11 @@ X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) &TM.getSubtarget<X86Subtarget>())); } +void X86ATTInstPrinter::printRegName(raw_ostream &OS, + unsigned RegNo) const { + OS << '%' << getRegisterName(RegNo); +} + void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { // Try to print any aliases first. if (!printAliasInstr(MI, OS)) diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 8d69391..5f939b6 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -26,6 +26,7 @@ class X86ATTInstPrinter : public MCInstPrinter { public: X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI); + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 47253eb..5f581ba 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -29,6 +29,10 @@ using namespace llvm; #define GET_INSTRUCTION_NAME #include "X86GenAsmWriter1.inc" +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { printInstruction(MI, OS); diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index ca99dc0..c8030c3 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ 
b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -27,6 +27,7 @@ public: X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index e7429a3..bcfdf0b 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -124,51 +124,6 @@ if we have whole-function selectiondags. //===---------------------------------------------------------------------===// -Take the following C code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640): - -struct u1 -{ - float x; - float y; -}; - -float foo(struct u1 u) -{ - return u.x + u.y; -} - -Optimizes to the following IR: -define float @foo(double %u.0) nounwind readnone { -entry: - %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2] - %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1] - %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1] - %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1] - %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1] - %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1] - %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1] - ret float %0 -} - -And current llvm-gcc/clang output: - movd %xmm0, %rax - movd %eax, %xmm1 - shrq $32, %rax - movd %eax, %xmm0 - addss %xmm1, %xmm0 - ret - -We really shouldn't move the floats to RAX, only to immediately move them -straight back to the XMM registers. - -There really isn't any good way to handle this purely in IR optimizers; it -could possibly be handled by changing the output of the fronted, though. It -would also be feasible to add a x86-specific DAGCombine to optimize the -bitcast+trunc+(lshr+)bitcast combination. 
- -//===---------------------------------------------------------------------===// - Take the following code (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): extern unsigned long table[]; diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index ea3014e..560947a 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2040,3 +2040,29 @@ _clamp2: ## @clamp2 The move of 0 could be scheduled above the test to make it is xor reg,reg. //===---------------------------------------------------------------------===// + +GCC PR48986. We currently compile this: + +void bar(void); +void yyy(int* p) { + if (__sync_fetch_and_add(p, -1) == 1) + bar(); +} + +into: + movl $-1, %eax + lock + xaddl %eax, (%rdi) + cmpl $1, %eax + je LBB0_2 + +Instead we could generate: + + lock + dec %rdi + je LBB0_2 + +The trick is to match "fetch_and_add(X, -C) == C". + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 25b8d3e..7bb9676 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -101,8 +101,10 @@ def : Proc<"i686", []>; def : Proc<"pentiumpro", [FeatureCMOV]>; def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; def : Proc<"pentium3", [FeatureSSE1]>; +def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; +def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; @@ -122,7 +124,7 @@ def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. // FIXME: Disabling AVX for now since it's not ready. 
-def : Proc<"sandybridge", [FeatureSSE42, Feature64Bit, +def : Proc<"corei7-avx", [FeatureSSE42, Feature64Bit, FeatureAES, FeatureCLMUL]>; def : Proc<"k6", [FeatureMMX]>; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 1382f18..f1b9972 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -108,11 +108,11 @@ private: bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); - bool X86SelectExtractValue(const Instruction *I); - bool X86VisitIntrinsicCall(const IntrinsicInst &I); bool X86SelectCall(const Instruction *I); + bool DoSelectCall(const Instruction *I, const char *MemIntName); + const X86InstrInfo *getInstrInfo() const { return getTargetMachine()->getInstrInfo(); } @@ -135,6 +135,8 @@ private: bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false); + bool IsMemcpySmall(uint64_t Len); + bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); }; @@ -401,7 +403,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); continue; } - + // A array/variable index is always of the form i*S where S is the // constant scale size. See if we can push the scale into immediates. uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); @@ -469,7 +471,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->isThreadLocal()) return false; - + // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the // global value into its own register, which we can use as the basereg. @@ -704,7 +706,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, + I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -724,18 +727,38 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Only handle register returns for now. if (!VA.isRegLoc()) return false; - // TODO: For now, don't try to handle cases where getLocInfo() - // says Full but the types don't match. - if (TLI.getValueType(RV->getType()) != VA.getValVT()) - return false; // The calling-convention tables for x87 returns don't tell // the whole story. if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) return false; - // Make the copy. unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. unsigned DstReg = VA.getLocReg(); const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. @@ -916,18 +939,31 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. 
- if (I->getType()->isIntegerTy(8) && - I->getOperand(0)->getType()->isIntegerTy(1)) { - unsigned ResultReg = getRegForValue(I->getOperand(0)); - if (ResultReg == 0) return false; - // Set the high bits to zero. - ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); - if (ResultReg == 0) return false; - UpdateValueMap(I, ResultReg); - return true; + if (!I->getOperand(0)->getType()->isIntegerTy(1)) + return false; + + EVT DstVT = TLI.getValueType(I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + if (ResultReg == 0) + return false; + + if (DstVT != MVT::i8) { + ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; } - return false; + UpdateValueMap(I, ResultReg); + return true; } @@ -1010,63 +1046,6 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TrueMBB); return true; } - } else if (ExtractValueInst *EI = - dyn_cast<ExtractValueInst>(BI->getCondition())) { - // Check to see if the branch instruction is from an "arithmetic with - // overflow" intrinsic. The main way these intrinsics are used is: - // - // %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - // %sum = extractvalue { i32, i1 } %t, 0 - // %obit = extractvalue { i32, i1 } %t, 1 - // br i1 %obit, label %overflow, label %normal - // - // The %sum and %obit are converted in an ADD and a SETO/SETB before - // reaching the branch. Therefore, we search backwards through the MBB - // looking for the SETO/SETB instruction. If an instruction modifies the - // EFLAGS register before we reach the SETO/SETB instruction, then we can't - // convert the branch into a JO/JB instruction. 
- if (const IntrinsicInst *CI = - dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){ - if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow || - CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) { - const MachineInstr *SetMI = 0; - unsigned Reg = getRegForValue(EI); - - for (MachineBasicBlock::const_reverse_iterator - RI = FuncInfo.MBB->rbegin(), RE = FuncInfo.MBB->rend(); - RI != RE; ++RI) { - const MachineInstr &MI = *RI; - - if (MI.definesRegister(Reg)) { - if (MI.isCopy()) { - Reg = MI.getOperand(1).getReg(); - continue; - } - - SetMI = &MI; - break; - } - - const TargetInstrDesc &TID = MI.getDesc(); - if (TID.hasImplicitDefOfPhysReg(X86::EFLAGS) || - MI.hasUnmodeledSideEffects()) - break; - } - - if (SetMI) { - unsigned OpCode = SetMI->getOpcode(); - - if (OpCode == X86::SETOr || OpCode == X86::SETBr) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(OpCode == X86::SETOr ? X86::JO_4 : X86::JB_4)) - .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DL); - FuncInfo.MBB->addSuccessor(TrueMBB); - return true; - } - } - } - } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which // typically happen for _Bool and C++ bools. @@ -1086,13 +1065,13 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (OpReg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) .addReg(OpReg).addImm(1); - + unsigned JmpOpc = X86::JNE_4; if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); JmpOpc = X86::JE_4; } - + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DL); @@ -1266,18 +1245,13 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { } bool X86FastISel::X86SelectTrunc(const Instruction *I) { - if (Subtarget->is64Bit()) - // All other cases should be handled by the tblgen generated code. 
- return false; EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); - // This code only handles truncation to byte right now. + // This code only handles truncation to byte. if (DstVT != MVT::i8 && DstVT != MVT::i1) - // All other cases should be handled by the tblgen generated code. return false; - if (SrcVT != MVT::i16 && SrcVT != MVT::i32) - // All other cases should be handled by the tblgen generated code. + if (!TLI.isTypeLegal(SrcVT)) return false; unsigned InputReg = getRegForValue(I->getOperand(0)); @@ -1285,16 +1259,26 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { // Unhandled operand. Halt "fast" selection and bail. return false; - // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) - ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; - unsigned CopyReg = createResultReg(CopyRC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - CopyReg).addReg(InputReg); + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + UpdateValueMap(I, InputReg); + return true; + } + + if (!Subtarget->is64Bit()) { + // If we're on x86-32; we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) + ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(InputReg); + InputReg = CopyReg; + } - // Then issue an extract_subreg. + // Issue an extract_subreg. 
unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, - CopyReg, /*Kill=*/true, + InputReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; @@ -1303,36 +1287,18 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { return true; } -bool X86FastISel::X86SelectExtractValue(const Instruction *I) { - const ExtractValueInst *EI = cast<ExtractValueInst>(I); - const Value *Agg = EI->getAggregateOperand(); - - if (const IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) { - switch (CI->getIntrinsicID()) { - default: break; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // Cheat a little. We know that the registers for "add" and "seto" are - // allocated sequentially. However, we only keep track of the register - // for "add" in the value map. Use extractvalue's index to get the - // correct register for "seto". - unsigned OpReg = getRegForValue(Agg); - if (OpReg == 0) - return false; - UpdateValueMap(I, OpReg + *EI->idx_begin()); - return true; - } - } - } - - return false; +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); } bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len) { + // Make sure we don't bloat code by inlining very large memcpy's. - bool i64Legal = TLI.isTypeLegal(MVT::i64); - if (Len > (i64Legal ? 32 : 16)) return false; + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); // We don't care about alignment here since we just emit integer accesses. while (Len) { @@ -1369,20 +1335,44 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::memcpy: { const MemCpyInst &MCI = cast<MemCpyInst>(I); // Don't handle volatile or variable length memcpys. 
- if (MCI.isVolatile() || !isa<ConstantInt>(MCI.getLength())) + if (MCI.isVolatile()) + return false; + + if (isa<ConstantInt>(MCI.getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) return false; - uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); - - // Get the address of the dest and source addresses. - X86AddressMode DestAM, SrcAM; - if (!X86SelectAddress(MCI.getRawDest(), DestAM) || - !X86SelectAddress(MCI.getRawSource(), SrcAM)) + if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) return false; - return TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return DoSelectCall(&I, "memcpy"); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast<MemSetInst>(I); + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memset"); } - case Intrinsic::stackprotector: { // Emit code inline code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); @@ -1396,29 +1386,6 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; return true; } - case Intrinsic::objectsize: { - // FIXME: This should be moved to generic code! 
- ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1)); - const Type *Ty = I.getCalledFunction()->getReturnType(); - - MVT VT; - if (!isTypeLegal(Ty, VT)) - return false; - - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::MOV32ri; - else if (VT == MVT::i64) - OpC = X86::MOV64ri; - else - return false; - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg). - addImm(CI->isZero() ? -1ULL : 0); - UpdateValueMap(&I, ResultReg); - return true; - } case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I); X86AddressMode AM; @@ -1439,12 +1406,9 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: { // FIXME: Should fold immediates. - + // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction. Later on, when the "extractvalue" - // instructions are encountered, we use the fact that two registers were - // created sequentially to get the correct registers for the "sum" and the - // "overflow bit". + // by a seto/setc instruction. const Function *Callee = I.getCalledFunction(); const Type *RetTy = cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); @@ -1470,27 +1434,18 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { else return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + // The call to CreateRegs builds two sequential registers, to store the + // both the the returned values. + unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) .addReg(Reg1).addReg(Reg2); - unsigned DestReg1 = UpdateValueMap(&I, ResultReg); - - // If the add with overflow is an intra-block value then we just want to - // create temporaries for it like normal. 
If it is a cross-block value then - // UpdateValueMap will return the cross-block register used. Since we - // *really* want the value to be live in the register pair known by - // UpdateValueMap, we have to use DestReg1+1 as the destination register in - // the cross block case. In the non-cross-block case, we should just make - // another register for the value. - if (DestReg1 != ResultReg) - ResultReg = DestReg1+1; - else - ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8)); unsigned Opc = X86::SETBr; if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1); + + UpdateValueMap(&I, ResultReg, 2); return true; } } @@ -1508,6 +1463,14 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) return X86VisitIntrinsicCall(*II); + return DoSelectCall(I, 0); +} + +// Select either a call, or an llvm.memcpy/memmove/memset intrinsic +bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + // Handle only C and fastcc calling conventions for now. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); @@ -1533,12 +1496,15 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (Subtarget->IsCalleePop(isVarArg, CC)) return false; - // Handle *simple* calls for now. - const Type *RetTy = CS.getType(); - MVT RetVT; - if (RetTy->isVoidTy()) - RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT, true)) + // Check whether the function can return without sret-demotion. 
+ SmallVector<ISD::OutputArg, 4> Outs; + SmallVector<uint64_t, 4> Offsets; + GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), + Outs, TLI, &Offsets); + bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); + if (!CanLowerReturn) return false; // Materialize callee address in a register. FIXME: GV address can be @@ -1555,13 +1521,6 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } else return false; - // Allow calls which produce i1 results. - bool AndToI1 = false; - if (RetVT == MVT::i1) { - RetVT = MVT::i8; - AndToI1 = true; - } - // Deal with call operands first. SmallVector<const Value *, 8> ArgVals; SmallVector<unsigned, 8> Args; @@ -1573,6 +1532,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { ArgFlags.reserve(CS.arg_size()); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { + // If we're lowering a mem intrinsic instead of a regular call, skip the + // last two arguments, which should not passed to the underlying functions. 
+ if (MemIntName && e-i <= 2) + break; Value *ArgVal = *i; ISD::ArgFlagsTy Flags; unsigned AttrInd = i - CS.arg_begin() + 1; @@ -1581,6 +1544,25 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) Flags.setZExt(); + if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) { + const PointerType *Ty = cast<PointerType>(ArgVal->getType()); + const Type *ElementTy = Ty->getElementType(); + unsigned FrameSize = TD.getTypeAllocSize(ElementTy); + unsigned FrameAlign = CS.getParamAlignment(AttrInd); + if (!FrameAlign) + FrameAlign = TLI.getByValTypeAlignment(ElementTy); + Flags.setByVal(); + Flags.setByValSize(FrameSize); + Flags.setByValAlign(FrameAlign); + if (!IsMemcpySmall(FrameSize)) + return false; + } + + if (CS.paramHasAttr(AttrInd, Attribute::InReg)) + Flags.setInReg(); + if (CS.paramHasAttr(AttrInd, Attribute::Nest)) + Flags.setNest(); + // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra // instruction. This is safe because it is common to all fastisel supported // calling conventions on x86. @@ -1593,9 +1575,9 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); } } - + unsigned ArgReg; - + // Passing bools around ends up doing a trunc to i1 and passing it. // Codegen this as an argument + "and 1". if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && @@ -1604,10 +1586,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); ArgReg = getRegForValue(ArgVal); if (ArgReg == 0) return false; - + MVT ArgVT; if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; - + ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, ArgVal->hasOneUse(), 1); } else { @@ -1616,16 +1598,12 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (ArgReg == 0) return false; - // FIXME: Only handle *easy* calls for now. 
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; - const Type *ArgTy = ArgVal->getType(); MVT ArgVT; if (!isTypeLegal(ArgTy, ArgVT)) return false; + if (ArgVT == MVT::x86mmx) + return false; unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); Flags.setOrigAlign(OriginalAlignment); @@ -1637,7 +1615,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, isVarArg, TM, ArgLocs, I->getParent()->getContext()); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, + I->getParent()->getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) @@ -1666,6 +1645,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); assert(Emitted && "Failed to emit a sext!"); (void)Emitted; @@ -1673,6 +1654,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { break; } case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); assert(Emitted && "Failed to emit a zext!"); (void)Emitted; @@ -1680,9 +1663,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { break; } case CCValAssign::AExt: { - // We don't handle MMX parameters yet. 
- if (VA.getLocVT().isVector() && VA.getLocVT().getSizeInBits() == 128) - return false; + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); if (!Emitted) @@ -1716,14 +1698,21 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { AM.Base.Reg = StackPtr; AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; - - // If this is a really simple value, emit this with the Value* version of - // X86FastEmitStore. If it isn't simple, we don't want to do this, as it - // can cause us to reevaluate the argument. - if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) + ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; + + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = Arg; + bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()); + assert(Res && "memcpy length already checked!"); (void)Res; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + //of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument. X86FastEmitStore(ArgVT, ArgVal, AM); - else + } else { X86FastEmitStore(ArgVT, Arg, AM); + } } } @@ -1793,8 +1782,11 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) - .addGlobalAddress(GV, 0, OpFlags); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); + if (MemIntName) + MIB.addExternalSymbol(MemIntName, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); } // Add an implicit use GOT pointer in EBX. @@ -1816,63 +1808,74 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(NumBytesCallee); - // Now handle call return value (if any). 
- SmallVector<unsigned, 4> UsedRegs; - if (RetVT != MVT::isVoid) { - SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext()); - CCInfo.AnalyzeCallResult(RetVT, RetCC_X86); + // Build info for return calling conv lowering code. + // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo. + SmallVector<ISD::InputArg, 32> Ins; + SmallVector<EVT, 4> RetTys; + ComputeValueVTs(TLI, I->getType(), RetTys); + for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { + EVT VT = RetTys[i]; + EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); + for (unsigned j = 0; j != NumRegs; ++j) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT.getSimpleVT(); + MyFlags.Used = !CS.getInstruction()->use_empty(); + if (CS.paramHasAttr(0, Attribute::SExt)) + MyFlags.Flags.setSExt(); + if (CS.paramHasAttr(0, Attribute::ZExt)) + MyFlags.Flags.setZExt(); + if (CS.paramHasAttr(0, Attribute::InReg)) + MyFlags.Flags.setInReg(); + Ins.push_back(MyFlags); + } + } - // Copy all of the result registers out of their specified physreg. - assert(RVLocs.size() == 1 && "Can't handle multi-value calls!"); - EVT CopyVT = RVLocs[0].getValVT(); - TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); + // Now handle call return values. + SmallVector<unsigned, 4> UsedRegs; + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, + I->getParent()->getContext()); + unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + EVT CopyVT = RVLocs[i].getValVT(); + unsigned CopyReg = ResultReg + i; // If this is a call to a function that returns an fp value on the x87 fp // stack, but where we prefer to use the value in xmm registers, copy it // out as F80 and use a truncate to move it from fp stack reg to xmm reg. 
- if ((RVLocs[0].getLocReg() == X86::ST0 || - RVLocs[0].getLocReg() == X86::ST1) && + if ((RVLocs[i].getLocReg() == X86::ST0 || + RVLocs[i].getLocReg() == X86::ST1) && isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) { CopyVT = MVT::f80; - DstRC = X86::RFP80RegisterClass; + CopyReg = createResultReg(X86::RFP80RegisterClass); } - unsigned ResultReg = createResultReg(DstRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); + CopyReg).addReg(RVLocs[i].getLocReg()); + UsedRegs.push_back(RVLocs[i].getLocReg()); - if (CopyVT != RVLocs[0].getValVT()) { + if (CopyVT != RVLocs[i].getValVT()) { // Round the F80 the right size, which also moves to the appropriate xmm // register. This is accomplished by storing the F80 value in memory and // then loading it back. Ewww... - EVT ResVT = RVLocs[0].getValVT(); + EVT ResVT = RVLocs[i].getValVT(); unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)), FI) - .addReg(ResultReg); - DstRC = ResVT == MVT::f32 - ? X86::FR32RegisterClass : X86::FR64RegisterClass; + .addReg(CopyReg); Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; - ResultReg = createResultReg(DstRC); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Opc), ResultReg), FI); + TII.get(Opc), ResultReg + i), FI); } - - if (AndToI1) { - // Mask out all but lowest bit for some call which produces an i1. - unsigned AndResult = createResultReg(X86::GR8RegisterClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1); - ResultReg = AndResult; - } - - UpdateValueMap(I, ResultReg); } + if (RVLocs.size()) + UpdateValueMap(I, ResultReg, RVLocs.size()); + // Set all unused physreg defs as dead. 
static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -1911,8 +1914,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); - case Instruction::ExtractValue: - return X86SelectExtractValue(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); @@ -1990,7 +1991,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { if (AM.BaseType == X86AddressMode::RegBase && AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) return AM.Base.Reg; - + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 06d12fc..2e95300 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -355,7 +355,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || - !Fn->doesNotThrow() || UnwindTablesMandatory; + Fn->needsUnwindTableEntry(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. 
bool HasFP = hasFP(MF); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 4534e85..1fcc274 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -189,6 +189,7 @@ namespace { SDNode *Select(SDNode *N); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); + SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); @@ -1329,6 +1330,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { return ResNode; } +// FIXME: Figure out some way to unify this with the 'or' and other code +// below. SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; @@ -1479,6 +1482,158 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { } } +enum AtomicOpc { + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr + } +}; + +SDNode 
*X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return 0; + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + // FIXME: Same as for 'add' and 'sub', try to merge those down here. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return 0; + + // Which index into the table. + enum AtomicOpc Op; + switch (Node->getOpcode()) { + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + default: + return 0; + } + + bool isCN = false; + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); + if (CN) { + isCN = true; + Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT); + } + + unsigned Opc = 0; + switch (NVT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + } else + Opc = AtomicOpcTbl[Op][I64]; + break; + } + + DebugLoc dl = Node->getDebugLoc(); + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + 
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); +} + /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has /// any uses which require the SF or OF bits to be accurate. static bool HasNoSignedComparisonUses(SDNode *N) { @@ -1580,6 +1735,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: { + SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } case ISD::AND: case ISD::OR: case ISD::XOR: { @@ -1843,17 +2006,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = - SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16, + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, MVT::Other, Ops, array_lengthof(Ops)), 0); Chain = Move.getValue(1); ReplaceUses(N0.getValue(1), Chain); } else { Move = - SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0); + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); Chain = CurDAG->getEntryNode(); } - Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue()); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); InFlag = Chain.getValue(1); } else { InFlag = diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 703c01d..294a6a7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -222,7 +222,7 @@ 
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); - + // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. if (Subtarget->is64Bit()) @@ -574,6 +574,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + // Lower this to FGETSIGNx86 plus an AND. + setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); + setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); + // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); @@ -927,7 +931,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Can turn SHL into an integer multiply. setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are @@ -949,6 +952,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } + if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v8i16, Custom); + + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v8i16, Custom); + } + if (Subtarget->hasSSE42()) setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); @@ -1081,6 +1097,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SINT_TO_FP); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); @@ -1096,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; + + setPrefFunctionAlignment(4); } @@ -1247,11 +1266,6 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { - return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; -} - // FIXME: Why this routine is here? Move to RegInfo! 
std::pair<const TargetRegisterClass*, uint8_t> X86TargetLowering::findRepresentativeClass(EVT VT) const{ @@ -1306,11 +1320,12 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, #include "X86GenCallingConv.inc" bool -X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, +X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1325,7 +1340,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -1476,8 +1491,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -1518,20 +1533,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1)); - } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { - // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 
- if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - MVT::v2i64, InFlag).getValue(1); - Val = Chain.getValue(0); - Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, - Val, DAG.getConstant(0, MVT::i64)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - MVT::i64, InFlag).getValue(1); - Val = Chain.getValue(0); - } - Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); @@ -1680,7 +1681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2007,7 +2008,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2530,16 +2531,30 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (RegInfo->needsStackRealignment(MF)) return false; - // Do not sibcall optimize vararg calls unless the call site is not passing - // any arguments. - if (isVarArg && !Outs.empty()) - return false; - // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. 
+ if (Subtarget->isTargetWin64()) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. // Therefore if it's not used by the call it is not safe to optimize this into // a sibcall. @@ -2552,8 +2567,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -2566,13 +2581,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, getTargetMachine(), - RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, getTargetMachine(), - RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -2598,8 +2613,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. 
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) { @@ -6619,9 +6634,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { } -/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and /// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -6710,12 +6725,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, unsigned ByteSize = SrcVT.getSizeInBits()/8; - int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); - + FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); + MachineMemOperand *MMO; + if (FI) { + int SSFI = FI->getIndex(); + MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + } else { + MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + StackSlot = StackSlot.getOperand(1); + } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : X86ISD::FILD, DL, @@ -7206,6 +7227,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } +SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). + SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, + DAG.getConstant(1, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, @@ -8781,16 +8813,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { return Res; } -SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + // Must have SSE2. + if (!Subtarget->hasSSE2()) return SDValue(); + + // Optimize shl/srl/sra with constant shift amount. 
+ if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + // Lower SHL with variable shift amount. + // Cannot lower SHL without SSE4.1 or later. 
+ if (!Subtarget->hasSSE41()) return SDValue(); - if (VT == MVT::v4i32) { + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), Op.getOperand(1), DAG.getConstant(23, MVT::i32)); @@ -8809,7 +8896,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } - if (VT == MVT::v16i8) { + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { // a = a << 5; Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), @@ -9114,7 +9201,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: - case ISD::SRL_PARTS: return LowerShift(Op, DAG); + case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); @@ -9122,6 +9209,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VSETCC: return LowerVSETCC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); @@ -9142,7 +9230,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL_V2I64(Op, DAG); - case ISD::SHL: return LowerSHL(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: return LowerShift(Op, DAG); case ISD::SADDO: case ISD::UADDO: case 
ISD::SSUBO: @@ -9309,6 +9399,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; + case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; + case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -10986,14 +11078,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; - // Compute the element's address. + // Compute the element's address. SDValue Idx = Extract->getOperand(1); unsigned EltSize = InputVector.getValueType().getVectorElementType().getSizeInBits()/8; uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), StackPtr, OffsetVal); // Load the scalar. @@ -11266,15 +11358,28 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) return SDValue(); + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. 
- if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). - X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); - if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); @@ -11284,7 +11389,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); @@ -11302,7 +11406,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); @@ -11341,7 +11444,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. @@ -11576,12 +11678,94 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, } +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. 
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // The SETCCs should both refer to the same CMP. + if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + X86ISD::NodeType NTOperator = is64BitFP ? + X86ISD::FSETCCsd : X86ISD::FSETCCss; + // FIXME: need symbolic constants for these magic numbers. + // See X86ATTInstPrinter.cpp:printSSECC(). + unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; + SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, + DAG.getConstant(x86cc, MVT::i8)); + SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, + OnesOrZeroesF); + SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, + DAG.getConstant(1, MVT::i32)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); + return OneBitOfTruth; + } + } + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + // Want to form PANDN nodes, in the hopes of then easily combining them with // OR and AND nodes to form PBLEND/PSIGN. EVT VT = N->getValueType(0); @@ -11611,6 +11795,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + EVT VT = N->getValueType(0); if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) return SDValue(); @@ -11978,6 +12166,26 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { + DebugLoc dl = N->getDebugLoc(); + SDValue Op0 = N->getOperand(0); + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have + // a 32-bit target where SSE doesn't support i64->FP operations. 
+ if (Op0.getOpcode() == ISD::LOAD) { + LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); + EVT VT = Ld->getValueType(0); + if (!Ld->isVolatile() && !N->getValueType(0).isVector() && + ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && + !XTLI->getSubtarget()->is64Bit() && + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); + return FILDChain; + } + } + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -12062,6 +12270,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); @@ -12491,12 +12700,16 @@ LowerXConstraint(EVT ConstraintVT) const { /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, - char Constraint, + std::string &Constraint, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); - switch (Constraint) { + // Only support length 1 constraints for now. 
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { @@ -12688,7 +12901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return std::make_pair(0U, X86::GR8RegisterClass); if (VT == MVT::i16) return std::make_pair(0U, X86::GR16RegisterClass); - if (VT == MVT::i32 || !Subtarget->is64Bit()) + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) return std::make_pair(0U, X86::GR32RegisterClass); return std::make_pair(0U, X86::GR64RegisterClass); case 'R': // LEGACY_REGS diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 6301057..d61a125 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -94,6 +94,15 @@ namespace llvm { // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCCss, FSETCCsd, + + /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, + /// result in an integer GPR. Needs masking for scalar result. + FGETSIGNx86, + /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. Operand 2 is the condition code, and operand 3 is the /// flag operand produced by a CMP or TEST instruction. It also writes a @@ -592,7 +601,7 @@ namespace llvm { /// true it means one of the asm constraint of the inline asm instruction /// being processed is 'm'. virtual void LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -674,15 +683,15 @@ namespace llvm { /// or null if the target does not support "fast" ISel. 
virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as /// appropriate. virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const; + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, + SelectionDAG &DAG) const; + protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(EVT VT) const; @@ -773,9 +782,7 @@ namespace llvm { SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; - SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, - SelectionDAG &DAG) const; + SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -786,6 +793,7 @@ namespace llvm { SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -808,7 +816,7 @@ namespace llvm { SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const; - SDValue 
LowerSHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; @@ -850,9 +858,10 @@ namespace llvm { ISD::NodeType ExtendKind) const; virtual bool - CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const; + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned NewOp) const; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 4c915d9..33534cd 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -214,6 +214,30 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), (SETBr)>; +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 
(X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + //===----------------------------------------------------------------------===// // String Pseudo Instructions // @@ -519,85 +543,98 @@ def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero), Requires<[In64BitMode]>, LOCK; -// Optimized codegen when the non-memory output is not used. +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { -def LOCK_ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), - "lock\n\t" - "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2), - "lock\n\t" - "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs), - (ins i64mem:$dst, i64i32imm :$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; - -def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, 
(outs), (ins i16mem:$dst, i16i8imm :$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs), - (ins i64mem:$dst, i64i8imm :$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; - -def LOCK_SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2), - "lock\n\t" - "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, OpSize, LOCK; +def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#8mi : 
Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +} -def LOCK_SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2), - "lock\n\t" - "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2), - 
"lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs), - (ins i64mem:$dst, i64i32imm:$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +} +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">; -def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs), - (ins i64mem:$dst, i64i8imm :$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +// Optimized codegen when the non-memory output is not used. +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "lock\n\t" @@ -960,7 +997,8 @@ def : Pat<(extloadi64i32 addr:$src), // anyext. Define these to do an explicit zero-extend to // avoid partial-register updates. -def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>; +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; // Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. 
@@ -1127,9 +1165,9 @@ def : Pat<(and GR32:$src1, 0xff), Requires<[In32BitMode]>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1, - GR16_ABCD)), - sub_8bit))>, + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, Requires<[In32BitMode]>; // r & (2^32-1) ==> movz @@ -1147,7 +1185,8 @@ def : Pat<(and GR32:$src1, 0xff), Requires<[In64BitMode]>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)))>, + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, Requires<[In64BitMode]>; @@ -1159,10 +1198,11 @@ def : Pat<(sext_inreg GR32:$src, i8), GR32_ABCD)), sub_8bit))>, Requires<[In32BitMode]>; + def : Pat<(sext_inreg GR16:$src, i8), - (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, - GR16_ABCD)), - sub_8bit))>, + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, Requires<[In32BitMode]>; def : Pat<(sext_inreg GR64:$src, i32), @@ -1175,9 +1215,19 @@ def : Pat<(sext_inreg GR32:$src, i8), (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, Requires<[In64BitMode]>; def : Pat<(sext_inreg GR16:$src, i8), - (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, sub_8bit)))>, + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, Requires<[In64BitMode]>; +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; // trunc patterns def : Pat<(i16 (trunc GR32:$src)), @@ -1474,12 +1524,6 @@ def : Pat<(mul (loadi16 
addr:$src1), i16immSExt8:$src2), def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; -// Optimize multiply by 2 with EFLAGS result. -let AddedComplexity = 2 in { -def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>; -def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>; -} - // Patterns for nodes that do not produce flags, for instructions that do. // addition diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 867c0f8..2e1d523 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -38,22 +38,11 @@ let neverHasSideEffects = 1 in { // Sign/Zero extenders -// Use movsbl intead of movsbw; we don't care about the high 16 bits -// of the register here. This has a smaller encoding and avoids a -// partial-register update. Actual movsbw included for the disassembler. -def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; - -// FIXME: Use a pat pattern or define a syntax here. 
-let isCodeGenOnly=1 in { -def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src), - "", [(set GR16:$dst, (sext GR8:$src))]>, TB; -def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src), - "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB; -} -def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sext GR8:$src))]>, TB; def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), @@ -66,20 +55,10 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB; -// Use movzbl intead of movzbw; we don't care about the high 16 bits -// of the register here. This has a smaller encoding and avoids a -// partial-register update. Actual movzbw included for the disassembler. -def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -// FIXME: Use a pat pattern or define a syntax here. 
-let isCodeGenOnly=1 in { -def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src), - "", [(set GR16:$dst, (zext GR8:$src))]>, TB; -def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src), - "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB; -} +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (zext GR8:$src))]>, TB; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 3cbfac1..7c9a9f7 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,8 +38,11 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; +def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 83f0260..e2016eb 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2015,62 +2015,48 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool isStackAligned, const TargetMachine &TM, bool load) { - switch (RC->getID()) { + switch (RC->getSize()) { default: - llvm_unreachable("Unknown regclass"); - case X86::GR64RegClassID: - 
case X86::GR64_ABCDRegClassID: - case X86::GR64_NOREXRegClassID: - case X86::GR64_NOREX_NOSPRegClassID: - case X86::GR64_NOSPRegClassID: - case X86::GR64_TCRegClassID: - case X86::GR64_TCW64RegClassID: - return load ? X86::MOV64rm : X86::MOV64mr; - case X86::GR32RegClassID: - case X86::GR32_ABCDRegClassID: - case X86::GR32_ADRegClassID: - case X86::GR32_NOREXRegClassID: - case X86::GR32_NOSPRegClassID: - case X86::GR32_TCRegClassID: - return load ? X86::MOV32rm : X86::MOV32mr; - case X86::GR16RegClassID: - case X86::GR16_ABCDRegClassID: - case X86::GR16_NOREXRegClassID: - return load ? X86::MOV16rm : X86::MOV16mr; - case X86::GR8RegClassID: - // Copying to or from a physical H register on x86-64 requires a NOREX - // move. Otherwise use a normal move. - if (isHReg(Reg) && - TM.getSubtarget<X86Subtarget>().is64Bit()) - return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; - else - return load ? X86::MOV8rm : X86::MOV8mr; - case X86::GR8_ABCD_LRegClassID: - case X86::GR8_NOREXRegClassID: - return load ? X86::MOV8rm :X86::MOV8mr; - case X86::GR8_ABCD_HRegClassID: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); if (TM.getSubtarget<X86Subtarget>().is64Bit()) - return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; - else - return load ? X86::MOV8rm : X86::MOV8mr; - case X86::RFP80RegClassID: + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? 
X86::MOVSSrm : X86::MOVSSmr; + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOVSDrm : X86::MOVSDmr; + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; - case X86::RFP64RegClassID: - return load ? X86::LD_Fp64m : X86::ST_Fp64m; - case X86::RFP32RegClassID: - return load ? X86::LD_Fp32m : X86::ST_Fp32m; - case X86::FR32RegClassID: - return load ? X86::MOVSSrm : X86::MOVSSmr; - case X86::FR64RegClassID: - return load ? X86::MOVSDrm : X86::MOVSDmr; - case X86::VR128RegClassID: + case 16: + assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? X86::MOVAPSrm : X86::MOVAPSmr; else return load ? X86::MOVUPSrm : X86::MOVUPSmr; - case X86::VR64RegClassID: - return load ? 
X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; } } @@ -2434,7 +2420,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 4; break; default: - llvm_unreachable("Don't know how to fold this instruction!"); + return 0; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 8da68b5..d895023 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -449,7 +449,6 @@ namespace X86II { SSEDomainShift = SegOvrShift + 2, OpcodeShift = SSEDomainShift + 2, - OpcodeMask = 0xFFULL << OpcodeShift, //===------------------------------------------------------------------===// /// VEX - The opcode prefix used by AVX instructions diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 03a0b0c..8cab808 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -23,6 +23,9 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3, def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; +def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; @@ -1485,6 +1488,7 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // solely because gas supports it. 
def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; @@ -1546,8 +1550,8 @@ def : InstAlias<"movq $src, $dst", def : InstAlias<"movsd", (MOVSD)>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; @@ -1555,8 +1559,8 @@ def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index bb2165a..b2d9fca 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -285,7 +285,7 @@ let Constraints = "$src1 = $dst" in defm MMX_PAND : 
MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>; defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>; defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>; -defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index cde3f6b..b64c03a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1056,13 +1056,37 @@ let neverHasSideEffects = 1 in { XD, VEX_4V; } +let Constraints = "$src1 = $dst" in { +def CMPSSrr : SIi8<0xC2, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc), + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, imm:$cc))]>, XS; +def CMPSSrm : SIi8<0xC2, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc), + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS; +def CMPSDrr : SIi8<0xC2, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc), + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD; +def CMPSDrm : SIi8<0xC2, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc), + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD; +} let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}">, XS; - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}">, XD; +def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg, + 
(outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2), + "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS; +def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2), + "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS; +def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2), + "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD; +def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2), + "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD; } multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, @@ -1327,11 +1351,6 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, } // Mask creation -defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", - SSEPackedSingle>, TB; -defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", - SSEPackedDouble>, TB, OpSize; - defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", SSEPackedSingle>, VEX; defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, @@ -1342,6 +1361,24 @@ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, "movmskpd", SSEPackedDouble>, OpSize, VEX; +defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", + SSEPackedSingle>, TB; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", + SSEPackedDouble>, TB, OpSize; + +// X86fgetsign +def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src), + "movmskpd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize; +def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src), + "movmskpd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (X86fgetsign FR64:$src))], 
SSEPackedDouble>, TB, OpSize; +def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src), + "movmskps\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB; +def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src), + "movmskps\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB; // Assembler Only def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), @@ -1875,21 +1912,6 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, // SSE 1 & 2 - Non-temporal stores //===----------------------------------------------------------------------===// -def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; -def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; - -let ExeDomain = SSEPackedInt in - def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; - let AddedComplexity = 400 in { // Prefer non-temporal versions def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), @@ -1906,12 +1928,16 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>, VEX; + let ExeDomain = SSEPackedInt in - def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 
VR128:$src), + addr:$dst)]>, VEX; + + def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), @@ -1943,18 +1969,6 @@ def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), (VMOVNTPSYmr addr:$dst, VR256:$src)>; -def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; -def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; - -let ExeDomain = SSEPackedInt in -def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; - let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -1972,22 +1986,19 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; + // There is no AVX form for instructions below this point def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti\t{$src, $dst|$dst, $src}", [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, TB, Requires<[HasSSE2]>; - def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti\t{$src, $dst|$dst, $src}", [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, TB, Requires<[HasSSE2]>; - } -def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "movnti\t{$src, 
$dst|$dst, $src}", - [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, - TB, Requires<[HasSSE2]>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc Instructions (No AVX form) @@ -4733,14 +4744,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), + "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, OpSize; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), + "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; @@ -4961,66 +4972,66 @@ defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>; // This set of instructions are only rm, the only difference is the size // of r and m. 
let Constraints = "$src1 = $dst" in { - def CRC32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), + def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i8mem:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_8 GR32:$src1, + (int_x86_sse42_crc32_32_8 GR32:$src1, (load addr:$src2)))]>; - def CRC32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR8:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>; - def CRC32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; + def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i16mem:$src2), "crc32{w} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_16 GR32:$src1, + (int_x86_sse42_crc32_32_16 GR32:$src1, (load addr:$src2)))]>, OpSize; - def CRC32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR16:$src2), "crc32{w} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>, + (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, OpSize; - def CRC32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "crc32{l} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_32 GR32:$src1, + (int_x86_sse42_crc32_32_32 GR32:$src1, (load addr:$src2)))]>; - def CRC32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "crc32{l} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>; - def CRC64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), + (int_x86_sse42_crc32_32_32 GR32:$src1, 
GR32:$src2))]>; + def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i8mem:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_8 GR64:$src1, + (int_x86_sse42_crc32_64_8 GR64:$src1, (load addr:$src2)))]>, REX_W; - def CRC64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), + def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR8:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>, + (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, REX_W; - def CRC64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), + def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "crc32{q} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_64 GR64:$src1, + (int_x86_sse42_crc32_64_64 GR64:$src1, (load addr:$src2)))]>, REX_W; - def CRC64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), + def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "crc32{q} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>, + (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, REX_W; } diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 83bba52..2e1ec63 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -108,8 +108,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - DwarfRequiresFrameSection = false; - // OpenBSD has buggy support for .quad in 32-bit mode, just split into two // .words. 
if (T.getOS() == Triple::OpenBSD && T.getArch() == Triple::x86) diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index f195a67..55aceba 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -1015,7 +1015,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } else { unsigned FixupKind; // FIXME: Is there a better way to know that we need a signed relocation? - if (MI.getOpcode() == X86::MOV64ri32 || + if (MI.getOpcode() == X86::ADD64ri32 || + MI.getOpcode() == X86::MOV64ri32 || MI.getOpcode() == X86::MOV64mi32 || MI.getOpcode() == X86::PUSH64i32) FixupKind = X86::reloc_signed_4byte; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index cbe6db2..793156f 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -355,10 +355,6 @@ ReSimplify: assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && "LEA has segment specified!"); break; - case X86::MOVZX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; - case X86::MOVZX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; - case X86::MOVSX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break; - case X86::MOVSX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break; case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break; case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break; case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 37fb0fe..1ad6203 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -73,29 +73,61 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, } } -/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF -/// specific numbering, used in debug info and exception tables. 
-int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const { - const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); - unsigned Flavour = DWARFFlavour::X86_64; - +static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) { if (!Subtarget->is64Bit()) { if (Subtarget->isTargetDarwin()) { if (isEH) - Flavour = DWARFFlavour::X86_32_DarwinEH; + return DWARFFlavour::X86_32_DarwinEH; else - Flavour = DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; } else if (Subtarget->isTargetCygMing()) { // Unsupported by now, just quick fallback - Flavour = DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; } else { - Flavour = DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; } } + return DWARFFlavour::X86_64; +} + +/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF +/// specific numbering, used in debug info and exception tables. +int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + unsigned Flavour = getFlavour(Subtarget, isEH); return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour); } +/// getLLVMRegNum - This function maps DWARF register numbers to LLVM register. 
+int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + unsigned Flavour = getFlavour(Subtarget, isEH); + + return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour); +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + int reg = getX86RegNum(i); + switch (i) { + case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: + case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: + case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: + case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: + case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: + case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: + case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: + case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + reg += 8; + } + return reg; +} + /// getX86RegNum - This function maps LLVM register identifiers to their X86 /// specific numbering, which is used in various places encoding instructions. 
unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) { @@ -229,19 +261,13 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, } break; case X86::sub_8bit_hi: - if (B == &X86::GR8_ABCD_HRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || - A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || - A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || - A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass) - return &X86::GR32_ABCDRegClass; - else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass || - A == &X86::GR16_NOREXRegClass) - return &X86::GR16_ABCDRegClass; - } + if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass)) + switch (A->getSize()) { + case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass); + case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass); + case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass); + default: return 0; + } break; case X86::sub_16bit: if (B == &X86::GR16RegClass) { @@ -285,9 +311,16 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, A == &X86::GR64_NOREX_NOSPRegClass) return &X86::GR64_ABCDRegClass; } else if (B == &X86::GR32_NOREXRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass) + return &X86::GR64_NOREXRegClass; + else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREX_NOSPRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + } else if (B == &X86::GR32_NOREX_NOSPRegClass) { if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREXRegClass; + return &X86::GR64_NOREX_NOSPRegClass; else if (A == &X86::GR64_ABCDRegClass) return &X86::GR64_ABCDRegClass; } @@ -473,6 +506,34 @@ BitVector X86RegisterInfo::getReservedRegs(const 
MachineFunction &MF) const { Reserved.set(X86::ST5); Reserved.set(X86::ST6); Reserved.set(X86::ST7); + + // Mark the segment registers as reserved. + Reserved.set(X86::CS); + Reserved.set(X86::SS); + Reserved.set(X86::DS); + Reserved.set(X86::ES); + Reserved.set(X86::FS); + Reserved.set(X86::GS); + + // Reserve the registers that only exist in 64-bit mode. + if (!Is64Bit) { + for (unsigned n = 0; n != 8; ++n) { + const unsigned GPR64[] = { + X86::R8, X86::R9, X86::R10, X86::R11, + X86::R12, X86::R13, X86::R14, X86::R15 + }; + for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + + // XMM8, XMM9, ... + assert(X86::XMM15 == X86::XMM8+7); + for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + } + } + return Reserved; } diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 9970c52..dd3d3dc 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -80,6 +80,10 @@ public: /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum /// (created by TableGen) for target dependencies. int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; /// Code Generation virtual methods... 
/// diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index fd7a247..f1d149c 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -41,82 +41,82 @@ let Namespace = "X86" in { // 8-bit registers // Low registers - def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>; - def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>; - def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>; - def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>; + def AL : Register<"al">; + def DL : Register<"dl">; + def CL : Register<"cl">; + def BL : Register<"bl">; // X86-64 only, requires REX. let CostPerUse = 1 in { - def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>; - def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>; - def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>; - def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>; - def R8B : Register<"r8b">, DwarfRegNum<[8, -2, -2]>; - def R9B : Register<"r9b">, DwarfRegNum<[9, -2, -2]>; - def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>; - def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>; - def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>; - def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>; - def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>; - def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>; + def SIL : Register<"sil">; + def DIL : Register<"dil">; + def BPL : Register<"bpl">; + def SPL : Register<"spl">; + def R8B : Register<"r8b">; + def R9B : Register<"r9b">; + def R10B : Register<"r10b">; + def R11B : Register<"r11b">; + def R12B : Register<"r12b">; + def R13B : Register<"r13b">; + def R14B : Register<"r14b">; + def R15B : Register<"r15b">; } // High registers. On x86-64, these cannot be used in any instruction // with a REX prefix. 
- def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>; - def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>; - def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>; - def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>; + def AH : Register<"ah">; + def DH : Register<"dh">; + def CH : Register<"ch">; + def BH : Register<"bh">; // 16-bit registers let SubRegIndices = [sub_8bit, sub_8bit_hi] in { - def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>; - def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>; - def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>; - def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>; + def AX : RegisterWithSubRegs<"ax", [AL,AH]>; + def DX : RegisterWithSubRegs<"dx", [DL,DH]>; + def CX : RegisterWithSubRegs<"cx", [CL,CH]>; + def BX : RegisterWithSubRegs<"bx", [BL,BH]>; } let SubRegIndices = [sub_8bit] in { - def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>; - def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>; - def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>; - def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>; + def SI : RegisterWithSubRegs<"si", [SIL]>; + def DI : RegisterWithSubRegs<"di", [DIL]>; + def BP : RegisterWithSubRegs<"bp", [BPL]>; + def SP : RegisterWithSubRegs<"sp", [SPL]>; } - def IP : Register<"ip">, DwarfRegNum<[16]>; + def IP : Register<"ip">; // X86-64 only, requires REX. 
let SubRegIndices = [sub_8bit], CostPerUse = 1 in { - def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>; - def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>; - def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>; - def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>; - def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>; - def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>; - def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>; - def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>; + def R8W : RegisterWithSubRegs<"r8w", [R8B]>; + def R9W : RegisterWithSubRegs<"r9w", [R9B]>; + def R10W : RegisterWithSubRegs<"r10w", [R10B]>; + def R11W : RegisterWithSubRegs<"r11w", [R11B]>; + def R12W : RegisterWithSubRegs<"r12w", [R12B]>; + def R13W : RegisterWithSubRegs<"r13w", [R13B]>; + def R14W : RegisterWithSubRegs<"r14w", [R14B]>; + def R15W : RegisterWithSubRegs<"r15w", [R15B]>; } // 32-bit registers let SubRegIndices = [sub_16bit] in { - def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>; - def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>; - def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>; - def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>; - def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>; - def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>; - def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>; - def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>; - def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; + def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>; + def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>; + def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>; + def EBX : RegisterWithSubRegs<"ebx", 
[BX]>, DwarfRegNum<[-2, 3, 3]>; + def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>; + def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>; + def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>; + def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>; + def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>; // X86-64 only, requires REX let CostPerUse = 1 in { - def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>; - def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>; - def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>; - def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>; - def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>; - def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>; - def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>; - def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>; + def R8D : RegisterWithSubRegs<"r8d", [R8W]>; + def R9D : RegisterWithSubRegs<"r9d", [R9W]>; + def R10D : RegisterWithSubRegs<"r10d", [R10W]>; + def R11D : RegisterWithSubRegs<"r11d", [R11W]>; + def R12D : RegisterWithSubRegs<"r12d", [R12W]>; + def R13D : RegisterWithSubRegs<"r13d", [R13W]>; + def R14D : RegisterWithSubRegs<"r14d", [R14W]>; + def R15D : RegisterWithSubRegs<"r15d", [R15W]>; }} // 64-bit registers, X86-64 only @@ -188,22 +188,22 @@ let Namespace = "X86" in { // YMM Registers, used by AVX instructions let SubRegIndices = [sub_xmm] in { - def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>; - def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>; - def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>; - def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>; - def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>; - def YMM5: 
RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>; - def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>; - def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>; - def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegNum<[25, -2, -2]>; - def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegNum<[26, -2, -2]>; - def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>; - def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>; - def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>; - def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>; - def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>; - def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>; + def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>; + def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>; + def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>; + def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>; + def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>; + def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>; + def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>; + def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>; + def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>; + def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>; + def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>; + def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>; + def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>; + def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>; + def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>; + def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>; } // Floating 
point stack registers @@ -326,104 +326,12 @@ def GR16 : RegisterClass<"X86", [i16], 16, [AX, CX, DX, SI, DI, BX, BP, SP, R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR16_AO_64[] = { - X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, - X86::R8W, X86::R9W, X86::R10W, X86::R11W, - X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP - }; - - GR16Class::iterator - GR16Class::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR16_AO_64; - else - return begin(); - } - - GR16Class::iterator - GR16Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return array_endof(X86_GR16_AO_64) - 1; - else - // If not, just don't allocate SP. - return array_endof(X86_GR16_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return begin() + 6; - else - // If not, just don't allocate SP. 
- return begin() + 7; - } - } - }]; } def GR32 : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR32_AO_64[] = { - X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, - X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP - }; - - GR32Class::iterator - GR32Class::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR32_AO_64; - else - return begin(); - } - - GR32Class::iterator - GR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return array_endof(X86_GR32_AO_64) - 1; - else - // If not, just don't allocate ESP. - return array_endof(X86_GR32_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return begin() + 6; - else - // If not, just don't allocate ESP. - return begin() + 7; - } - } - }]; } // GR64 - 64-bit GPRs. 
This oddly includes RIP, which isn't accurate, since @@ -435,25 +343,6 @@ def GR64 : RegisterClass<"X86", [i64], 64, let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32 sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64Class::iterator - GR64Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (!Subtarget.is64Bit()) - return begin(); // None of these are allocatable in 32-bit. - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - return end()-3; // If so, don't allocate RIP, RSP or RBP - else - return end()-2; // If not, just don't allocate RIP or RSP - } - }]; } // Segment registers for use by MOV instructions (and others) that have a @@ -496,7 +385,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; } def GR64_TC : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI, - R8, R9, R11]> { + R8, R9, R11, RIP]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32_TC sub_32bit)]; @@ -543,48 +432,12 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, def GR16_NOREX : RegisterClass<"X86", [i16], 16, [AX, CX, DX, SI, DI, BX, BP, SP]> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR16_NOREXClass::iterator - GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = 
MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP / EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return end() - 2; - else - // If not, just don't allocate SP. - return end() - 1; - } - }]; } // GR32_NOREX - GR32 registers which do not require a REX prefix. def GR32_NOREX : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR32_NOREXClass::iterator - GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP / EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return end() - 2; - else - // If not, just don't allocate ESP. - return end() - 1; - } - }]; } // GR64_NOREX - GR64 registers which do not require a REX prefix. def GR64_NOREX : RegisterClass<"X86", [i64], 64, @@ -592,24 +445,6 @@ def GR64_NOREX : RegisterClass<"X86", [i64], 64, let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit), (GR32_NOREX sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOREXClass::iterator - GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP to being a frame ptr? 
- if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate RIP, RSP or RBP. - return end() - 3; - else - // If not, just don't allocate RIP or RSP. - return end() - 2; - } - }]; } // GR32_NOSP - GR32 registers except ESP. @@ -617,52 +452,6 @@ def GR32_NOSP : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, ESI, EDI, EBX, EBP, R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR32_NOSP_AO_64[] = { - X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, - X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP - }; - - GR32_NOSPClass::iterator - GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR32_NOSP_AO_64; - else - return begin(); - } - - GR32_NOSPClass::iterator - GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate EBP. - return array_endof(X86_GR32_NOSP_AO_64) - 1; - else - // If not, any reg in this class is ok. - return array_endof(X86_GR32_NOSP_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate EBP. 
- return begin() + 6; - else - // If not, any reg in this class is ok. - return begin() + 7; - } - } - }]; } // GR64_NOSP - GR64 registers except RSP (and RIP). @@ -672,25 +461,14 @@ def GR64_NOSP : RegisterClass<"X86", [i64], 64, let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32_NOSP sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOSPClass::iterator - GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (!Subtarget.is64Bit()) - return begin(); // None of these are allocatable in 32-bit. - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - return end()-1; // If so, don't allocate RBP - else - return end(); // If not, any reg in this class is ok. - } - }]; +} + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + [EAX, ECX, EDX, ESI, EDI, EBX, EBP]> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit)]; } // GR64_NOREX_NOSP - GR64_NOREX registers except RSP. 
@@ -698,26 +476,7 @@ def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit), - (GR32_NOREX sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOREX_NOSPClass::iterator - GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const - { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate RBP. - return end() - 1; - else - // If not, any reg in this class is ok. - return end(); - } - }]; + (GR32_NOREX_NOSP sub_32bit)]; } // A class to support the 'A' assembler constraint: EAX then EDX. @@ -731,42 +490,12 @@ def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> { def FR32 : RegisterClass<"X86", [f32], 32, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - FR32Class::iterator - FR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. 
- else - return end(); - } - }]; -} + XMM12, XMM13, XMM14, XMM15]>; def FR64 : RegisterClass<"X86", [f64], 64, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - FR64Class::iterator - FR64Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; -} + XMM12, XMM13, XMM14, XMM15]>; // FIXME: This sets up the floating point register files as though they are f64 @@ -784,15 +513,7 @@ def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; // for transforming FPn allocations to STn registers) def RST : RegisterClass<"X86", [f80, f64, f32], 32, [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - RSTClass::iterator - RSTClass::allocation_order_end(const MachineFunction &MF) const { - return begin(); - } - }]; + let isAllocatable = 0; } // Generic vector registers: VR64 and VR128. @@ -803,21 +524,6 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; - - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VR128Class::iterator - VR128Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. 
- else - return end(); - } - }]; } def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256, @@ -825,35 +531,10 @@ def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15]> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; - - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VR256Class::iterator - VR256Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only YMM0 to YMM7 are available in 32-bit mode. - else - return end(); - } - }]; } // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> { let CopyCost = -1; // Don't allow copying of status registers. - - // EFLAGS is not allocatable. - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - CCRClass::iterator - CCRClass::allocation_order_end(const MachineFunction &MF) const { - return allocation_order_begin(MF); - } - }]; + let isAllocatable = 0; } diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index ba5864e..481e821 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -265,6 +265,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasCLMUL = IsIntel && ((ECX >> 1) & 0x1); HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); + HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); HasAES = IsIntel && ((ECX >> 25) & 0x1); if (IsIntel || IsAMD) { |