Diffstat (limited to 'contrib/llvm/lib/Target/X86')
88 files changed, 75397 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp new file mode 100644 index 0000000..68908ab --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -0,0 +1,2518 @@ +//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +struct X86Operand; + +static const char OpPrecedence[] = { + 0, // IC_PLUS + 0, // IC_MINUS + 1, // IC_MULTIPLY + 1, // IC_DIVIDE + 2, // IC_RPAREN + 3, // IC_LPAREN + 0, // IC_IMM + 0 // IC_REGISTER +}; + +class X86AsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + MCAsmParser &Parser; + ParseInstructionInfo *InstInfo; +private: + enum InfixCalculatorTok { + IC_PLUS = 0, + IC_MINUS, + IC_MULTIPLY, + IC_DIVIDE, + IC_RPAREN, + IC_LPAREN, + IC_IMM, + IC_REGISTER + }; + + class InfixCalculator { + typedef std::pair< InfixCalculatorTok, int64_t > ICToken; + SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; + SmallVector<ICToken, 4> PostfixStack; + + public: + int64_t popOperand() { + assert (!PostfixStack.empty() && "Poped an empty stack!"); + ICToken Op = PostfixStack.pop_back_val(); + assert ((Op.first == IC_IMM || Op.first == IC_REGISTER) + && "Expected and immediate or register!"); + return Op.second; + } + void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) { + assert ((Op == IC_IMM || Op == IC_REGISTER) && + "Unexpected operand!"); + PostfixStack.push_back(std::make_pair(Op, Val)); + } + + void popOperator() { InfixOperatorStack.pop_back_val(); } + void pushOperator(InfixCalculatorTok Op) { + // Push the new operator if the stack is empty. + if (InfixOperatorStack.empty()) { + InfixOperatorStack.push_back(Op); + return; + } + + // Push the new operator if it has a higher precedence than the operator + // on the top of the stack or the operator on the top of the stack is a + // left parentheses. + unsigned Idx = InfixOperatorStack.size() - 1; + InfixCalculatorTok StackOp = InfixOperatorStack[Idx]; + if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) { + InfixOperatorStack.push_back(Op); + return; + } + + // The operator on the top of the stack has higher precedence than the + // new operator. + unsigned ParenCount = 0; + while (1) { + // Nothing to process. 
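          // Worked example of this pop loop: after pushOperand(IC_IMM, 2),
          // pushOperator(IC_MULTIPLY) and pushOperand(IC_IMM, 3), a
          // pushOperator(IC_PLUS) call lands here.  OpPrecedence[IC_PLUS]
          // (0) is not greater than OpPrecedence[IC_MULTIPLY] (1), so '*'
          // is popped onto PostfixStack before '+' is pushed; after a final
          // pushOperand(IC_IMM, 4), execute() evaluates the postfix form
          // "2 3 * 4 +" to 10.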
+ if (InfixOperatorStack.empty()) + break; + + Idx = InfixOperatorStack.size() - 1; + StackOp = InfixOperatorStack[Idx]; + if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount)) + break; + + // If we have an even parentheses count and we see a left parentheses, + // then stop processing. + if (!ParenCount && StackOp == IC_LPAREN) + break; + + if (StackOp == IC_RPAREN) { + ++ParenCount; + InfixOperatorStack.pop_back_val(); + } else if (StackOp == IC_LPAREN) { + --ParenCount; + InfixOperatorStack.pop_back_val(); + } else { + InfixOperatorStack.pop_back_val(); + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + } + // Push the new operator. + InfixOperatorStack.push_back(Op); + } + int64_t execute() { + // Push any remaining operators onto the postfix stack. + while (!InfixOperatorStack.empty()) { + InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val(); + if (StackOp != IC_LPAREN && StackOp != IC_RPAREN) + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + + if (PostfixStack.empty()) + return 0; + + SmallVector<ICToken, 16> OperandStack; + for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { + ICToken Op = PostfixStack[i]; + if (Op.first == IC_IMM || Op.first == IC_REGISTER) { + OperandStack.push_back(Op); + } else { + assert (OperandStack.size() > 1 && "Too few operands."); + int64_t Val; + ICToken Op2 = OperandStack.pop_back_val(); + ICToken Op1 = OperandStack.pop_back_val(); + switch (Op.first) { + default: + report_fatal_error("Unexpected operator!"); + break; + case IC_PLUS: + Val = Op1.second + Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MINUS: + Val = Op1.second - Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MULTIPLY: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Multiply operation with an immediate and a register!"); + Val = Op1.second * Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_DIVIDE: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Divide operation with an immediate and a register!"); + assert (Op2.second != 0 && "Division by zero!"); + Val = Op1.second / Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + } + } + } + assert (OperandStack.size() == 1 && "Expected a single result."); + return OperandStack.pop_back_val().second; + } + }; + + enum IntelExprState { + IES_PLUS, + IES_MINUS, + IES_MULTIPLY, + IES_DIVIDE, + IES_LBRAC, + IES_RBRAC, + IES_LPAREN, + IES_RPAREN, + IES_REGISTER, + IES_INTEGER, + IES_IDENTIFIER, + IES_ERROR + }; + + class IntelExprStateMachine { + IntelExprState State, PrevState; + unsigned BaseReg, IndexReg, TmpReg, Scale; + int64_t Imm; + const MCExpr *Sym; + StringRef SymName; + bool StopOnLBrac, AddImmPrefix; + InfixCalculator IC; + InlineAsmIdentifierInfo Info; + public: + IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : + State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), + Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac), + AddImmPrefix(addimmprefix) { Info.clear(); } + + unsigned getBaseReg() { return BaseReg; } + unsigned getIndexReg() { return IndexReg; } + unsigned getScale() { return Scale; } + const MCExpr *getSym() { return Sym; } + StringRef getSymName() { return SymName; } + int64_t getImm() { return Imm + IC.execute(); } + bool isValidEndState() { return State == IES_RBRAC; } + bool getStopOnLBrac() { return StopOnLBrac; } + bool getAddImmPrefix() { return AddImmPrefix; } + bool 
hadError() { return State == IES_ERROR; } + + InlineAsmIdentifierInfo &getIdentifierInfo() { + return Info; + } + + void onPlus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onMinus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + case IES_RPAREN: + case IES_LBRAC: + case IES_RBRAC: + case IES_INTEGER: + case IES_REGISTER: + State = IES_MINUS; + // Only push the minus operator if it is not a unary operator. + if (!(CurrState == IES_PLUS || CurrState == IES_MINUS || + CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE || + CurrState == IES_LPAREN || CurrState == IES_LBRAC)) + IC.pushOperator(IC_MINUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onRegister(unsigned Reg) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_LPAREN: + State = IES_REGISTER; + TmpReg = Reg; + IC.pushOperand(IC_REGISTER); + break; + case IES_MULTIPLY: + // Index Register - Scale * Register + if (PrevState == IES_INTEGER) { + assert (!IndexReg && "IndexReg already set!"); + State = IES_REGISTER; + IndexReg = Reg; + // Get the scale and replace the 'Scale * Register' with '0'. + Scale = IC.popOperand(); + IC.pushOperand(IC_IMM); + IC.popOperator(); + } else { + State = IES_ERROR; + } + break; + } + PrevState = CurrState; + } + void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + State = IES_INTEGER; + Sym = SymRef; + SymName = SymRefName; + IC.pushOperand(IC_IMM); + break; + } + } + void onInteger(int64_t TmpInt) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_DIVIDE: + case IES_MULTIPLY: + case IES_LPAREN: + State = IES_INTEGER; + if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { + // Index Register - Register * Scale + assert (!IndexReg && "IndexReg already set!"); + IndexReg = TmpReg; + Scale = TmpInt; + // Get the scale and replace the 'Register * Scale' with '0'. + IC.popOperator(); + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + CurrState == IES_MINUS) { + // Unary minus. No need to pop the minus operand because it was never + // pushed. + IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. 
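        // For example, when parsing "-4" at the start of an expression,
        // onMinus() ran with CurrState == IES_PLUS (the initial state), so
        // no IC_MINUS operator was pushed above and the literal is folded
        // into the single operand -4 here.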
+ } else { + IC.pushOperand(IC_IMM, TmpInt); + } + break; + } + PrevState = CurrState; + } + void onStar() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_MULTIPLY; + IC.pushOperator(IC_MULTIPLY); + break; + } + } + void onDivide() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + State = IES_DIVIDE; + IC.pushOperator(IC_DIVIDE); + break; + } + } + void onLBrac() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_RBRAC: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + break; + } + } + void onRBrac() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RBRAC; + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onLParen() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + // FIXME: We don't handle this type of unary minus, yet. + if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + CurrState == IES_MINUS) { + State = IES_ERROR; + break; + } + State = IES_LPAREN; + IC.pushOperator(IC_LPAREN); + break; + } + PrevState = CurrState; + } + void onRParen() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RPAREN; + IC.pushOperator(IC_RPAREN); + break; + } + } + }; + + MCAsmParser &getParser() const { return Parser; } + + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + bool Error(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None, + bool MatchingInlineAsm = false) { + if (MatchingInlineAsm) return true; + return Parser.Error(L, Msg, Ranges); + } + + X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) { + Error(Loc, Msg); + return 0; + } + + X86Operand *ParseOperand(); + X86Operand *ParseATTOperand(); + X86Operand *ParseIntelOperand(); + X86Operand *ParseIntelOffsetOfOperator(); + X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); + X86Operand *ParseIntelOperator(unsigned OpKind); + X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp, + SMLoc StartLoc); + X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); + X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size); + X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End); + + X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + + X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); + + bool ParseDirectiveWord(unsigned Size, SMLoc L); + bool 
ParseDirectiveCode(StringRef IDVal, SMLoc L); + + bool processInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm); + + /// isSrcOp - Returns true if the operand is either (%rsi) or %ds:(%rsi) + /// in 64-bit mode or (%esi) or %ds:(%esi) in 32-bit mode. + bool isSrcOp(X86Operand &Op); + + /// isDstOp - Returns true if the operand is either (%rdi) or %es:(%rdi) + /// in 64-bit mode or (%edi) or %es:(%edi) in 32-bit mode. + bool isDstOp(X86Operand &Op); + + bool is64BitMode() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + } + void SwitchMode() { + unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit)); + setAvailableFeatures(FB); + } + + bool isParsingIntelSyntax() { + return getParser().getAssemblerDialect(); + } + + /// @name Auto-generated Matcher Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "X86GenAsmMatcher.inc" + + /// } + +public: + X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) + : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) { + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + virtual bool ParseDirective(AsmToken DirectiveID); +}; +} // end anonymous namespace + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +static bool isImmSExti16i8Value(uint64_t Value) { + return (( Value <= 0x000000000000007FULL)|| + (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + +static bool isImmSExti32i8Value(uint64_t Value) { + return (( Value <= 0x000000000000007FULL)|| + (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + +static bool isImmZExtu32u8Value(uint64_t Value) { + return (Value <= 0x00000000000000FFULL); +} + +static bool isImmSExti64i8Value(uint64_t Value) { + return (( Value <= 0x000000000000007FULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + +static bool isImmSExti64i32Value(uint64_t Value) { + return (( Value <= 0x000000007FFFFFFFULL)|| + (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} +namespace { + +/// X86Operand - Instances of this class represent a parsed X86 machine +/// operand.
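/// Each operand is one of four kinds (Token, Register, Immediate or Memory,
/// see KindTy below), with the kind-specific payload held in the union that
/// follows.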
+struct X86Operand : public MCParsedAsmOperand { + enum KindTy { + Token, + Register, + Immediate, + Memory + } Kind; + + SMLoc StartLoc, EndLoc; + SMLoc OffsetOfLoc; + StringRef SymName; + void *OpDecl; + bool AddressOf; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct RegOp { + unsigned RegNo; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + struct MemOp { + unsigned SegReg; + const MCExpr *Disp; + unsigned BaseReg; + unsigned IndexReg; + unsigned Scale; + unsigned Size; + }; + + union { + struct TokOp Tok; + struct RegOp Reg; + struct ImmOp Imm; + struct MemOp Mem; + }; + + X86Operand(KindTy K, SMLoc Start, SMLoc End) + : Kind(K), StartLoc(Start), EndLoc(End) {} + + StringRef getSymName() { return SymName; } + void *getOpDecl() { return OpDecl; } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getOffsetOfLoc - Get the location of the offset operator. + SMLoc getOffsetOfLoc() const { return OffsetOfLoc; } + + virtual void print(raw_ostream &OS) const {} + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + void setTokenValue(StringRef Value) { + assert(Kind == Token && "Invalid access!"); + Tok.Data = Value.data(); + Tok.Length = Value.size(); + } + + unsigned getReg() const { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNo; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getMemDisp() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Disp; + } + unsigned getMemSegReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.SegReg; + } + unsigned getMemBaseReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.BaseReg; + } + unsigned getMemIndexReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.IndexReg; + } + unsigned getMemScale() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Scale; + } + + bool isToken() const {return Kind == Token; } + + bool isImm() const { return Kind == Immediate; } + + bool isImmSExti16i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti16i8Value(CE->getValue()); + } + bool isImmSExti32i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti32i8Value(CE->getValue()); + } + bool isImmZExtu32u8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. 
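      // (The isImm*Value helpers above each accept up to three disjoint
      // ranges: small non-negative values, values that are negative once
      // truncated to the operand width, and values already sign-extended
      // to 64 bits.  E.g. isImmSExti16i8Value(0xFFFF) is true because
      // 0xFFFF is -1 as an i16, which fits an 8-bit immediate after sign
      // extension.)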
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmZExtu32u8Value(CE->getValue()); + } + bool isImmSExti64i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti64i8Value(CE->getValue()); + } + bool isImmSExti64i32() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti64i32Value(CE->getValue()); + } + + bool isOffsetOf() const { + return OffsetOfLoc.getPointer(); + } + + bool needAddressOf() const { + return AddressOf; + } + + bool isMem() const { return Kind == Memory; } + bool isMem8() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 8); + } + bool isMem16() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 16); + } + bool isMem32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32); + } + bool isMem64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64); + } + bool isMem80() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 80); + } + bool isMem128() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 128); + } + bool isMem256() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 256); + } + + bool isMemVX32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVY32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVX64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVY64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + + bool isAbsMem() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1; + } + + bool isReg() const { return Kind == Register; } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. 
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addMem8Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem16Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem80Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem128Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem256Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVX32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVY32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVX64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVY64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + + void addMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 5) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); + Inst.addOperand(MCOperand::CreateImm(getMemScale())); + Inst.addOperand(MCOperand::CreateReg(getMemIndexReg())); + addExpr(Inst, getMemDisp()); + Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); + } + + void addAbsMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); + } + + static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { + SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); + X86Operand *Res = new X86Operand(Token, Loc, EndLoc); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + return Res; + } + + static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, + SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), + void *OpDecl = 0) { + X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); + Res->Reg.RegNo = RegNo; + Res->AddressOf = AddressOf; + Res->OffsetOfLoc = OffsetOfLoc; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + return Res; + } + + static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ + X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); + Res->Imm.Val = Val; + return Res; + } + + /// Create an absolute memory operand. 
+ static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = 0) { + X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = 0; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = 0; + Res->Mem.IndexReg = 0; + Res->Mem.Scale = 1; + Res->Mem.Size = Size; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; + return Res; + } + + /// Create a generalized memory operand. + static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, + StringRef SymName = StringRef(), + void *OpDecl = 0) { + // We should never just have a displacement, that should be parsed as an + // absolute memory operand. + assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); + + // The scale should always be one of {1,2,4,8}. + assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && + "Invalid scale!"); + X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = SegReg; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = BaseReg; + Res->Mem.IndexReg = IndexReg; + Res->Mem.Scale = Scale; + Res->Mem.Size = Size; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; + return Res; + } +}; + +} // end anonymous namespace. + +bool X86AsmParser::isSrcOp(X86Operand &Op) { + unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI; + + return (Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0); +} + +bool X86AsmParser::isDstOp(X86Operand &Op) { + unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI; + + return Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0; +} + +bool X86AsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + RegNo = 0; + const AsmToken &PercentTok = Parser.getTok(); + StartLoc = PercentTok.getLoc(); + + // If we encounter a %, ignore it. This code handles registers with and + // without the prefix, unprefixed registers can occur in cfi directives. + if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) + Parser.Lex(); // Eat percent token. + + const AsmToken &Tok = Parser.getTok(); + EndLoc = Tok.getEndLoc(); + + if (Tok.isNot(AsmToken::Identifier)) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + RegNo = MatchRegisterName(Tok.getString()); + + // If the match failed, try the register name as lowercase. + if (RegNo == 0) + RegNo = MatchRegisterName(Tok.getString().lower()); + + if (!is64BitMode()) { + // FIXME: This should be done using Requires<In32BitMode> and + // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also + // checked. + // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a + // REX prefix. 
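    // For example, "%rax" (a GR64 register) or "%r8d" (a REX-extended
    // register) in 32-bit mode falls through to the check below and is
    // rejected as only available in 64-bit mode.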
+ if (RegNo == X86::RIZ || + X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || + X86II::isX86_64NonExtLowByteReg(RegNo) || + X86II::isX86_64ExtendedReg(RegNo)) + return Error(StartLoc, "register %" + + Tok.getString() + " is only available in 64-bit mode", + SMRange(StartLoc, EndLoc)); + } + + // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. + if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) { + RegNo = X86::ST0; + Parser.Lex(); // Eat 'st' + + // Check to see if we have '(4)' after %st. + if (getLexer().isNot(AsmToken::LParen)) + return false; + // Lex the paren. + getParser().Lex(); + + const AsmToken &IntTok = Parser.getTok(); + if (IntTok.isNot(AsmToken::Integer)) + return Error(IntTok.getLoc(), "expected stack index"); + switch (IntTok.getIntVal()) { + case 0: RegNo = X86::ST0; break; + case 1: RegNo = X86::ST1; break; + case 2: RegNo = X86::ST2; break; + case 3: RegNo = X86::ST3; break; + case 4: RegNo = X86::ST4; break; + case 5: RegNo = X86::ST5; break; + case 6: RegNo = X86::ST6; break; + case 7: RegNo = X86::ST7; break; + default: return Error(IntTok.getLoc(), "invalid stack index"); + } + + if (getParser().Lex().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "expected ')'"); + + EndLoc = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat ')' + return false; + } + + EndLoc = Parser.getTok().getEndLoc(); + + // If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (RegNo == 0 && Tok.getString().size() == 3 && + Tok.getString().startswith("db")) { + switch (Tok.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != 0) { + EndLoc = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat it. + return false; + } + } + + if (RegNo == 0) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + Parser.Lex(); // Eat identifier token. + return false; +} + +X86Operand *X86AsmParser::ParseOperand() { + if (isParsingIntelSyntax()) + return ParseIntelOperand(); + return ParseATTOperand(); +} + +/// getIntelMemOperandSize - Return intel memory operand size. +static unsigned getIntelMemOperandSize(StringRef OpStr) { + unsigned Size = StringSwitch<unsigned>(OpStr) + .Cases("BYTE", "byte", 8) + .Cases("WORD", "word", 16) + .Cases("DWORD", "dword", 32) + .Cases("QWORD", "qword", 64) + .Cases("XWORD", "xword", 80) + .Cases("XMMWORD", "xmmword", 128) + .Cases("YMMWORD", "ymmword", 256) + .Default(0); + return Size; +} + +X86Operand * +X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info){ + if (isa<MCSymbolRefExpr>(Disp)) { + // If this is not a VarDecl then assume it is a FuncDecl or some other label + // reference. We need an 'r' constraint here, so we need to create register + // operand to ensure proper matching. Just pick a GPR based on the size of + // a pointer. + if (!Info.IsVarDecl) { + unsigned RegNo = is64BitMode() ? 
X86::RBX : X86::EBX; + return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true, + SMLoc(), Identifier, Info.OpDecl); + } + if (!Size) { + Size = Info.Type * 8; // Size is in terms of bits in this context. + if (Size) + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, + /*Len=*/0, Size)); + } + } + + // When parsing inline assembly we set the base register to a non-zero value + // if we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. + BaseReg = BaseReg ? BaseReg : 1; + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, Identifier, Info.OpDecl); +} + +static void +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, + StringRef SymName, int64_t ImmDisp, + int64_t FinalImmDisp, SMLoc &BracLoc, + SMLoc &StartInBrac, SMLoc &End) { + // Remove the '[' and ']' from the IR string. + AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1)); + AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1)); + + // If ImmDisp is non-zero, then we parsed a displacement before the + // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) + // If ImmDisp doesn't match the displacement computed by the state machine + // then we have an additional displacement in the bracketed expression. + if (ImmDisp != FinalImmDisp) { + if (ImmDisp) { + // We have an immediate displacement before the bracketed expression. + // Adjust this to match the final immediate displacement. + bool Found = false; + for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), + E = AsmRewrites->end(); I != E; ++I) { + if ((*I).Loc.getPointer() > BracLoc.getPointer()) + continue; + if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) { + assert (!Found && "ImmDisp already rewritten."); + (*I).Kind = AOK_Imm; + (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer(); + (*I).Val = FinalImmDisp; + Found = true; + break; + } + } + assert (Found && "Unable to rewrite ImmDisp."); + } else { + // We have a symbolic and an immediate displacement, but no displacement + // before the bracketed expression. Put the immediate displacement + // before the bracketed expression. + AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp)); + } + } + // Remove all the ImmPrefix rewrites within the brackets. + for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), + E = AsmRewrites->end(); I != E; ++I) { + if ((*I).Loc.getPointer() < StartInBrac.getPointer()) + continue; + if ((*I).Kind == AOK_ImmPrefix) + (*I).Kind = AOK_Delete; + } + const char *SymLocPtr = SymName.data(); + // Skip everything before the symbol. + if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); + } + // Skip everything after the symbol. + if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { + SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len)); + } +} + +X86Operand * +X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { + const AsmToken &Tok = Parser.getTok(); + + bool Done = false; + while (!Done) { + bool UpdateLocLex = true; + + // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an + // identifier. Don't try an parse it as a register. 
+ if (Tok.getString().startswith(".")) + break; + + // If we're parsing an immediate expression, we don't expect a '['. + if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) + break; + + switch (getLexer().getKind()) { + default: { + if (SM.isValidEndState()) { + Done = true; + break; + } + return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + } + case AsmToken::EndOfStatement: { + Done = true; + break; + } + case AsmToken::Identifier: { + // This could be a register or a symbolic displacement. + unsigned TmpReg; + const MCExpr *Val; + SMLoc IdentLoc = Tok.getLoc(); + StringRef Identifier = Tok.getString(); + if(!ParseRegister(TmpReg, IdentLoc, End)) { + SM.onRegister(TmpReg); + UpdateLocLex = false; + break; + } else { + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); + } else { + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ false, End)) + return Err; + } + SM.onIdentifierExpr(Val, Identifier); + UpdateLocLex = false; + break; + } + return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); + } + case AsmToken::Integer: + if (isParsingInlineAsm() && SM.getAddImmPrefix()) + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, + Tok.getLoc())); + SM.onInteger(Tok.getIntVal()); + break; + case AsmToken::Plus: SM.onPlus(); break; + case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Star: SM.onStar(); break; + case AsmToken::Slash: SM.onDivide(); break; + case AsmToken::LBrac: SM.onLBrac(); break; + case AsmToken::RBrac: SM.onRBrac(); break; + case AsmToken::LParen: SM.onLParen(); break; + case AsmToken::RParen: SM.onRParen(); break; + } + if (SM.hadError()) + return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + + if (!Done && UpdateLocLex) { + End = Tok.getLoc(); + Parser.Lex(); // Consume the token. + } + } + return 0; +} + +X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, + unsigned Size) { + const AsmToken &Tok = Parser.getTok(); + SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(BracLoc, "Expected '[' token!"); + Parser.Lex(); // Eat '[' + + SMLoc StartInBrac = Tok.getLoc(); + // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We + // may have already parsed an immediate displacement before the bracketed + // expression. + IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); + if (X86Operand *Err = ParseIntelExpression(SM, End)) + return Err; + + const MCExpr *Disp; + if (const MCExpr *Sym = SM.getSym()) { + // A symbolic displacement. + Disp = Sym; + if (isParsingInlineAsm()) + RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(), + ImmDisp, SM.getImm(), BracLoc, StartInBrac, + End); + } else { + // An immediate displacement only. + Disp = MCConstantExpr::Create(SM.getImm(), getContext()); + } + + // Parse the dot operator (e.g., [ebx].foo.bar). + if (Tok.getString().startswith(".")) { + const MCExpr *NewDisp; + if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp)) + return Err; + + End = Tok.getEndLoc(); + Parser.Lex(); // Eat the field. 
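    // For an operand such as "[ebx].foo.bar", NewDisp now holds the
    // original constant displacement plus the field offset looked up for
    // ".foo.bar".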
+ Disp = NewDisp; + } + + int BaseReg = SM.getBaseReg(); + int IndexReg = SM.getIndexReg(); + int Scale = SM.getScale(); + if (!isParsingInlineAsm()) { + // handle [-42] + if (!BaseReg && !IndexReg) { + if (!SegReg) + return X86Operand::CreateMem(Disp, Start, End, Size); + else + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size); + } + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size); + } + + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, SM.getSymName(), Info); +} + +// Inline assembly may use variable names with namespace alias qualifiers. +X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, + StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, + SMLoc &End) { + assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); + Val = 0; + + StringRef LineBuf(Identifier.data()); + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + + const AsmToken &Tok = Parser.getTok(); + + // Advance the token stream until the end of the current token is + // after the end of what the frontend claimed. + const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); + while (true) { + End = Tok.getEndLoc(); + getLexer().Lex(); + + assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); + if (End.getPointer() == EndPtr) break; + } + + // Create the symbol reference. + Identifier = LineBuf; + MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext()); + return 0; +} + +/// ParseIntelMemOperand - Parse an Intel-style memory operand. +X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, + int64_t ImmDisp, + SMLoc Start) { + const AsmToken &Tok = Parser.getTok(); + SMLoc End; + + unsigned Size = getIntelMemOperandSize(Tok.getString()); + if (Size) { + Parser.Lex(); // Eat operand size (e.g., byte, word). + if (Tok.getString() != "PTR" && Tok.getString() != "ptr") + return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!"); + Parser.Lex(); // Eat ptr. + } + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. + if (getLexer().is(AsmToken::Integer)) { + if (isParsingInlineAsm()) + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, + Tok.getLoc())); + int64_t ImmDisp = Tok.getIntVal(); + Parser.Lex(); // Eat the integer. + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(Start, "Expected '[' token!"); + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + } + + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + + if (!ParseRegister(SegReg, Start, End)) { + // Handle SegReg : [ ... ] + if (getLexer().isNot(AsmToken::Colon)) + return ErrorOperand(Start, "Expected ':' token!"); + Parser.Lex(); // Eat : + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(Start, "Expected '[' token!"); + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + } + + const MCExpr *Val; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + + return X86Operand::CreateMem(Val, Start, End, Size); + } + + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ false, End)) + return Err; + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); +} + +/// Parse the '.' operator. +X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, + const MCExpr *&NewDisp) { + const AsmToken &Tok = Parser.getTok(); + int64_t OrigDispVal, DotDispVal; + + // FIXME: Handle non-constant expressions. + if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) + OrigDispVal = OrigDisp->getValue(); + else + return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!"); + + // Drop the '.'. + StringRef DotDispStr = Tok.getString().drop_front(1); + + // .Imm gets lexed as a real. + if (Tok.is(AsmToken::Real)) { + APInt DotDisp; + DotDispStr.getAsInteger(10, DotDisp); + DotDispVal = DotDisp.getZExtValue(); + } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + unsigned DotDisp; + std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); + if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, + DotDisp)) + return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!"); + DotDispVal = DotDisp; + } else + return ErrorOperand(Tok.getLoc(), "Unexpected token type!"); + + if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); + unsigned Len = DotDispStr.size(); + unsigned Val = OrigDispVal + DotDispVal; + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len, + Val)); + } + + NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext()); + return 0; +} + +/// Parse the 'offset' operator. This operator is used to specify the +/// location rather than the content of a variable. +X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { + const AsmToken &Tok = Parser.getTok(); + SMLoc OffsetOfLoc = Tok.getLoc(); + Parser.Lex(); // Eat offset. + + const MCExpr *Val; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ false, End)) + return Err; + + // Don't emit the offset operator. + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); + + // The offset operator will have an 'r' constraint, thus we need to create a + // register operand to ensure proper matching. Just pick a GPR based on + // the size of a pointer. + unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; + return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true, + OffsetOfLoc, Identifier, Info.OpDecl); +} + +enum IntelOperatorKind { + IOK_LENGTH, + IOK_SIZE, + IOK_TYPE +}; + +/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator +/// returns the number of elements in an array.
It returns the value 1 for +/// non-array variables. The SIZE operator returns the size of a C or C++ +/// variable. A variable's size is the product of its LENGTH and TYPE. The +/// TYPE operator returns the size of a C or C++ type or variable. If the +/// variable is an array, TYPE returns the size of a single element. +X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { + const AsmToken &Tok = Parser.getTok(); + SMLoc TypeLoc = Tok.getLoc(); + Parser.Lex(); // Eat operator. + + const MCExpr *Val = 0; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ true, End)) + return Err; + + unsigned CVal = 0; + switch(OpKind) { + default: llvm_unreachable("Unexpected operand kind!"); + case IOK_LENGTH: CVal = Info.Length; break; + case IOK_SIZE: CVal = Info.Size; break; + case IOK_TYPE: CVal = Info.Type; break; + } + + // Rewrite the type operator and the C or C++ type or variable in terms of an + // immediate. E.g. TYPE foo -> $$4 + unsigned Len = End.getPointer() - TypeLoc.getPointer(); + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); + + const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext()); + return X86Operand::CreateImm(Imm, Start, End); +} + +X86Operand *X86AsmParser::ParseIntelOperand() { + const AsmToken &Tok = Parser.getTok(); + SMLoc Start = Tok.getLoc(), End; + + // Offset, length, type and size operators. + if (isParsingInlineAsm()) { + StringRef AsmTokStr = Tok.getString(); + if (AsmTokStr == "offset" || AsmTokStr == "OFFSET") + return ParseIntelOffsetOfOperator(); + if (AsmTokStr == "length" || AsmTokStr == "LENGTH") + return ParseIntelOperator(IOK_LENGTH); + if (AsmTokStr == "size" || AsmTokStr == "SIZE") + return ParseIntelOperator(IOK_SIZE); + if (AsmTokStr == "type" || AsmTokStr == "TYPE") + return ParseIntelOperator(IOK_TYPE); + } + + // Immediate. + if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || + getLexer().is(AsmToken::LParen)) { + AsmToken StartTok = Tok; + IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (X86Operand *Err = ParseIntelExpression(SM, End)) + return Err; + + int64_t Imm = SM.getImm(); + if (isParsingInlineAsm()) { + unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); + if (StartTok.getString().size() == Len) + // Just add a prefix if this wasn't a complex immediate expression. + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start)); + else + // Otherwise, rewrite the complex expression as a single immediate. + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm)); + } + + if (getLexer().isNot(AsmToken::LBrac)) { + const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext()); + return X86Operand::CreateImm(ImmExpr, Start, End); + } + + // Only positive immediates are valid. + if (Imm < 0) + return ErrorOperand(Start, "expected a positive immediate displacement " + "before bracketed expr."); + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. + return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start); + } + + // Register. + unsigned RegNo = 0; + if (!ParseRegister(RegNo, Start, End)) { + // If this is a segment register followed by a ':', then this is the start + // of a memory reference, otherwise this is a normal register reference. 
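  // E.g. "fs:[eax]" continues below into ParseIntelMemOperand with
  // SegReg == X86::FS, while a bare "eax" is returned directly as a plain
  // register operand.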
+ if (getLexer().isNot(AsmToken::Colon)) + return X86Operand::CreateReg(RegNo, Start, End); + + getParser().Lex(); // Eat the colon. + return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start); + } + + // Memory operand. + return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start); +} + +X86Operand *X86AsmParser::ParseATTOperand() { + switch (getLexer().getKind()) { + default: + // Parse a memory operand with no segment register. + return ParseMemOperand(0, Parser.getTok().getLoc()); + case AsmToken::Percent: { + // Read the register. + unsigned RegNo; + SMLoc Start, End; + if (ParseRegister(RegNo, Start, End)) return 0; + if (RegNo == X86::EIZ || RegNo == X86::RIZ) { + Error(Start, "%eiz and %riz can only be used as index registers", + SMRange(Start, End)); + return 0; + } + + // If this is a segment register followed by a ':', then this is the start + // of a memory reference, otherwise this is a normal register reference. + if (getLexer().isNot(AsmToken::Colon)) + return X86Operand::CreateReg(RegNo, Start, End); + + getParser().Lex(); // Eat the colon. + return ParseMemOperand(RegNo, Start); + } + case AsmToken::Dollar: { + // $42 -> immediate. + SMLoc Start = Parser.getTok().getLoc(), End; + Parser.Lex(); + const MCExpr *Val; + if (getParser().parseExpression(Val, End)) + return 0; + return X86Operand::CreateImm(Val, Start, End); + } + } +} + +/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix +/// has already been parsed if present. +X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { + + // We have to disambiguate a parenthesized expression "(4+5)" from the start + // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The + // only way to do this without lookahead is to eat the '(' and see what is + // after it. + const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); + if (getLexer().isNot(AsmToken::LParen)) { + SMLoc ExprEnd; + if (getParser().parseExpression(Disp, ExprEnd)) return 0; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(Disp, MemStart, ExprEnd); + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } else { + // Okay, we have a '('. We don't know if this is an expression or not, but + // so we have to eat the ( to see beyond it. + SMLoc LParenLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the '('. + + if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) { + // Nothing to do here, fall into the code below with the '(' part of the + // memory operand consumed. + } else { + SMLoc ExprEnd; + + // It must be an parenthesized expression, parse it now. + if (getParser().parseParenExpression(Disp, ExprEnd)) + return 0; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd); + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } + } + + // If we reached here, then we just ate the ( of the memory operand. 
Process + // the rest of the memory operand. + unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + SMLoc IndexLoc; + + if (getLexer().is(AsmToken::Percent)) { + SMLoc StartLoc, EndLoc; + if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0; + if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { + Error(StartLoc, "eiz and riz can only be used as index registers", + SMRange(StartLoc, EndLoc)); + return 0; + } + } + + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + IndexLoc = Parser.getTok().getLoc(); + + // Following the comma we should have either an index register, or a scale + // value. We don't support the later form, but we want to parse it + // correctly. + // + // Not that even though it would be completely consistent to support syntax + // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. + if (getLexer().is(AsmToken::Percent)) { + SMLoc L; + if (ParseRegister(IndexReg, L, L)) return 0; + + if (getLexer().isNot(AsmToken::RParen)) { + // Parse the scale amount: + // ::= ',' [scale-expression] + if (getLexer().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), + "expected comma in scale expression"); + return 0; + } + Parser.Lex(); // Eat the comma. + + if (getLexer().isNot(AsmToken::RParen)) { + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t ScaleVal; + if (getParser().parseAbsoluteExpression(ScaleVal)){ + Error(Loc, "expected scale expression"); + return 0; + } + + // Validate the scale amount. + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ + Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); + return 0; + } + Scale = (unsigned)ScaleVal; + } + } + } else if (getLexer().isNot(AsmToken::RParen)) { + // A scale amount without an index is ignored. + // index. + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t Value; + if (getParser().parseAbsoluteExpression(Value)) + return 0; + + if (Value != 1) + Warning(Loc, "scale factor without index register is ignored"); + Scale = 1; + } + } + + // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); + return 0; + } + SMLoc MemEnd = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat the ')'. + + // If we have both a base register and an index register make sure they are + // both 64-bit or 32-bit registers. + // To support VSIB, IndexReg can be 128-bit or 256-bit registers. + if (BaseReg != 0 && IndexReg != 0) { + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) && + IndexReg != X86::RIZ) { + Error(IndexLoc, "index register is 32-bit, but base register is 64-bit"); + return 0; + } + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) && + IndexReg != X86::EIZ){ + Error(IndexLoc, "index register is 64-bit, but base register is 32-bit"); + return 0; + } + } + + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, + MemStart, MemEnd); +} + +bool X86AsmParser:: +ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + InstInfo = &Info; + StringRef PatchedName = Name; + + // FIXME: Hack to recognize setneb as setne. 
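  // E.g. "setneb" is matched as "setne"; "setb" and "setnb" are real
  // mnemonics (set-on-below / set-on-not-below) and are excluded below.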
+ if (PatchedName.startswith("set") && PatchedName.endswith("b") && + PatchedName != "setb" && PatchedName != "setnb") + PatchedName = PatchedName.substr(0, Name.size()-1); + + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. + const MCExpr *ExtraImmOp = 0; + if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && + (PatchedName.endswith("ss") || PatchedName.endswith("sd") || + PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { + bool IsVCMP = PatchedName[0] == 'v'; + unsigned SSECCIdx = IsVCMP ? 4 : 3; + unsigned SSEComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) + .Case("eq", 0x00) + .Case("lt", 0x01) + .Case("le", 0x02) + .Case("unord", 0x03) + .Case("neq", 0x04) + .Case("nlt", 0x05) + .Case("nle", 0x06) + .Case("ord", 0x07) + /* AVX only from here */ + .Case("eq_uq", 0x08) + .Case("nge", 0x09) + .Case("ngt", 0x0A) + .Case("false", 0x0B) + .Case("neq_oq", 0x0C) + .Case("ge", 0x0D) + .Case("gt", 0x0E) + .Case("true", 0x0F) + .Case("eq_os", 0x10) + .Case("lt_oq", 0x11) + .Case("le_oq", 0x12) + .Case("unord_s", 0x13) + .Case("neq_us", 0x14) + .Case("nlt_uq", 0x15) + .Case("nle_uq", 0x16) + .Case("ord_s", 0x17) + .Case("eq_us", 0x18) + .Case("nge_uq", 0x19) + .Case("ngt_uq", 0x1A) + .Case("false_os", 0x1B) + .Case("neq_os", 0x1C) + .Case("ge_oq", 0x1D) + .Case("gt_oq", 0x1E) + .Case("true_us", 0x1F) + .Default(~0U); + if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) { + ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, + getParser().getContext()); + if (PatchedName.endswith("ss")) { + PatchedName = IsVCMP ? "vcmpss" : "cmpss"; + } else if (PatchedName.endswith("sd")) { + PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; + } else if (PatchedName.endswith("ps")) { + PatchedName = IsVCMP ? "vcmpps" : "cmpps"; + } else { + assert(PatchedName.endswith("pd") && "Unexpected mnemonic!"); + PatchedName = IsVCMP ? "vcmppd" : "cmppd"; + } + } + } + + Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + + if (ExtraImmOp && !isParsingIntelSyntax()) + Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); + + // Determine whether this is an instruction prefix. + bool isPrefix = + Name == "lock" || Name == "rep" || + Name == "repe" || Name == "repz" || + Name == "repne" || Name == "repnz" || + Name == "rex64" || Name == "data16"; + + + // This does the actual operand parsing. Don't parse any more if we have a + // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we + // just want to parse the "lock" as the first instruction and the "incl" as + // the next one. + if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) { + + // Parse '*' modifier. + if (getLexer().is(AsmToken::Star)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(X86Operand::CreateToken("*", Loc)); + Parser.Lex(); // Eat the star. + } + + // Read the first operand. + if (X86Operand *Op = ParseOperand()) + Operands.push_back(Op); + else { + Parser.eatToEndOfStatement(); + return true; + } + + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. 
+ if (X86Operand *Op = ParseOperand()) + Operands.push_back(Op); + else { + Parser.eatToEndOfStatement(); + return true; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + } + + if (getLexer().is(AsmToken::EndOfStatement)) + Parser.Lex(); // Consume the EndOfStatement + else if (isPrefix && getLexer().is(AsmToken::Slash)) + Parser.Lex(); // Consume the prefix separator Slash + + if (ExtraImmOp && isParsingIntelSyntax()) + Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); + + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> + // "outb %al, %dx". Out doesn't take a memory form, but this is a widely + // documented form in various unofficial manuals, so a lot of code uses it. + if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.back(); + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } + // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". + if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } + // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]" + if (Name.startswith("ins") && Operands.size() == 3 && + (Name == "insb" || Name == "insw" || Name == "insl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "outs[bwl] %ds:(%esi), %dx" into "out[bwl]" + if (Name.startswith("outs") && Operands.size() == 3 && + (Name == "outsb" || Name == "outsw" || Name == "outsl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]" + if (Name.startswith("movs") && Operands.size() == 3 && + (Name == "movsb" || Name == "movsw" || Name == "movsl" || + (is64BitMode() && Name == "movsq"))) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]" + if (Name.startswith("lods") && Operands.size() == 3 && + (Name == "lods" || Name == "lodsb" || Name == "lodsw" || + Name == "lodsl" || (is64BitMode() && Name == "lodsq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = 
static_cast<X86Operand*>(Operands[2]);
+    if (isSrcOp(*Op1) && Op2->isReg()) {
+      const char *ins;
+      unsigned reg = Op2->getReg();
+      bool isLods = Name == "lods";
+      if (reg == X86::AL && (isLods || Name == "lodsb"))
+        ins = "lodsb";
+      else if (reg == X86::AX && (isLods || Name == "lodsw"))
+        ins = "lodsw";
+      else if (reg == X86::EAX && (isLods || Name == "lodsl"))
+        ins = "lodsl";
+      else if (reg == X86::RAX && (isLods || Name == "lodsq"))
+        ins = "lodsq";
+      else
+        ins = NULL;
+      if (ins != NULL) {
+        Operands.pop_back();
+        Operands.pop_back();
+        delete Op1;
+        delete Op2;
+        if (Name != ins)
+          static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+      }
+    }
+  }
+  // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]"
+  if (Name.startswith("stos") && Operands.size() == 3 &&
+      (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+       Name == "stosl" || (is64BitMode() && Name == "stosq"))) {
+    X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+    X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
+    if (isDstOp(*Op2) && Op1->isReg()) {
+      const char *ins;
+      unsigned reg = Op1->getReg();
+      bool isStos = Name == "stos";
+      if (reg == X86::AL && (isStos || Name == "stosb"))
+        ins = "stosb";
+      else if (reg == X86::AX && (isStos || Name == "stosw"))
+        ins = "stosw";
+      else if (reg == X86::EAX && (isStos || Name == "stosl"))
+        ins = "stosl";
+      else if (reg == X86::RAX && (isStos || Name == "stosq"))
+        ins = "stosq";
+      else
+        ins = NULL;
+      if (ins != NULL) {
+        Operands.pop_back();
+        Operands.pop_back();
+        delete Op1;
+        delete Op2;
+        if (Name != ins)
+          static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+      }
+    }
+  }
+
+  // FIXME: Hack to recognize "s{hr,ar,hl} $1, <op>" and canonicalize it to
+  // "shift <op>".
+  if ((Name.startswith("shr") || Name.startswith("sar") ||
+       Name.startswith("shl") || Name.startswith("sal") ||
+       Name.startswith("rcl") || Name.startswith("rcr") ||
+       Name.startswith("rol") || Name.startswith("ror")) &&
+      Operands.size() == 3) {
+    if (isParsingIntelSyntax()) {
+      // Intel syntax
+      X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
+      if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+          cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+        delete Operands[2];
+        Operands.pop_back();
+      }
+    } else {
+      X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+      if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+          cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+        delete Operands[1];
+        Operands.erase(Operands.begin() + 1);
+      }
+    }
+  }
+
+  // Transforms "int $3" into "int3" as a size optimization. We can't write an
+  // InstAlias with an immediate operand yet.
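+  // Sketch of the size win: "int3" is the dedicated one-byte breakpoint
+  // encoding, while the generic form takes two bytes:
+  //
+  //   int $3   # encodes as CD 03
+  //   int3     # encodes as CC
+  //
+  // so rewriting the mnemonic below saves a byte for essentially identical
+  // behavior.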
+ if (Name == "int" && Operands.size() == 2) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + } + } + + return false; +} + +static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg, + bool isCmp) { + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + if (!isCmp) + TmpInst.addOperand(MCOperand::CreateReg(Reg)); + TmpInst.addOperand(MCOperand::CreateReg(Reg)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; +} + +static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode, + bool isCmp = false) { + if (!Inst.getOperand(0).isImm() || + !isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + return convertToSExti8(Inst, Opcode, X86::AX, isCmp); +} + +static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode, + bool isCmp = false) { + if (!Inst.getOperand(0).isImm() || + !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + return convertToSExti8(Inst, Opcode, X86::EAX, isCmp); +} + +static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, + bool isCmp = false) { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); +} + +bool X86AsmParser:: +processInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Ops) { + switch (Inst.getOpcode()) { + default: return false; + case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); + case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8); + case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8); + case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8); + case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8); + case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8); + case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8); + case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8); + case X86::OR64i32: return convert64i32to64ri8(Inst, X86::OR64ri8); + case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true); + case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true); + case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true); + case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8); + case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8); + case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8); + case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8); + case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8); + case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8); + case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8); + case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8); + case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8); + case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8); + case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8); + case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8); + } +} + +static const char *getSubtargetFeatureName(unsigned Val); +bool X86AsmParser:: +MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + SmallVectorImpl<MCParsedAsmOperand*> 
&Operands,
+                        MCStreamer &Out, unsigned &ErrorInfo,
+                        bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpected empty operand list!");
+  X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
+  assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+  ArrayRef<SMRange> EmptyRanges = None;
+
+  // First, handle aliases that expand to multiple instructions.
+  // FIXME: This should be replaced with a real .td file alias mechanism.
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+  // call.
+  if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
+      Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
+      Op->getToken() == "finit" || Op->getToken() == "fsave" ||
+      Op->getToken() == "fstenv" || Op->getToken() == "fclex") {
+    MCInst Inst;
+    Inst.setOpcode(X86::WAIT);
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+
+    const char *Repl =
+      StringSwitch<const char*>(Op->getToken())
+        .Case("finit", "fninit")
+        .Case("fsave", "fnsave")
+        .Case("fstcw", "fnstcw")
+        .Case("fstcww", "fnstcw")
+        .Case("fstenv", "fnstenv")
+        .Case("fstsw", "fnstsw")
+        .Case("fstsww", "fnstsw")
+        .Case("fclex", "fnclex")
+        .Default(0);
+    assert(Repl && "Unknown wait-prefixed instruction");
+    delete Operands[0];
+    Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+  }
+
+  bool WasOriginallyInvalidOperand = false;
+  MCInst Inst;
+
+  // First, try a direct match.
+  switch (MatchInstructionImpl(Operands, Inst,
+                               ErrorInfo, MatchingInlineAsm,
+                               isParsingIntelSyntax())) {
+  default: break;
+  case Match_Success:
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the
+    // individual transformations can chain off each other.
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
+
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+    Opcode = Inst.getOpcode();
+    return false;
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing.
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+  }
+  case Match_InvalidOperand:
+    WasOriginallyInvalidOperand = true;
+    break;
+  case Match_MnemonicFail:
+    break;
+  }
+
+  // FIXME: Ideally, we would only attempt suffix matches for things which are
+  // valid prefixes, and we could just infer the right unambiguous
+  // type. However, that requires substantially more matcher support than the
+  // following hack.
+
+  // Change the operand to point to a temporary token.
+  StringRef Base = Op->getToken();
+  SmallString<16> Tmp;
+  Tmp += Base;
+  Tmp += ' ';
+  Op->setTokenValue(Tmp.str());
+
+  // If this instruction starts with an 'f', then it is a floating point stack
+  // instruction.  These come in up to three forms for 32-bit, 64-bit, and
+  // 80-bit floating point, which use the suffixes s,l,t respectively.
+  //
+  // Otherwise, we assume that this may be an integer instruction, which comes
+  // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+  const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+
+  // Check for the various suffix matches.
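+  // For example, "add $1, (%rax)" carries no register that implies an
+  // operand size, so the loop below retries the mnemonic as
+  // addb/addw/addl/addq.  A unique match is accepted as if the suffix had
+  // been written; several matches fall through to the "ambiguous
+  // instructions require an explicit suffix" diagnostic further down.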
+ Tmp[Base.size()] = Suffixes[0]; + unsigned ErrorInfoIgnore; + unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + unsigned Match1, Match2, Match3, Match4; + + Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match1 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + Tmp[Base.size()] = Suffixes[1]; + Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match2 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + Tmp[Base.size()] = Suffixes[2]; + Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match3 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + Tmp[Base.size()] = Suffixes[3]; + Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match4 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + + // Restore the old token. + Op->setTokenValue(Base); + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + (Match1 == Match_Success) + (Match2 == Match_Success) + + (Match3 == Match_Success) + (Match4 == Match_Success); + if (NumSuccessfulMatches == 1) { + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + Out.EmitInstruction(Inst); + Opcode = Inst.getOpcode(); + return false; + } + + // Otherwise, the match failed, try to produce a decent error message. + + // If we had multiple suffix matches, then identify this as an ambiguous + // match. + if (NumSuccessfulMatches > 1) { + char MatchChars[4]; + unsigned NumMatches = 0; + if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0]; + if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1]; + if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2]; + if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3]; + + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "ambiguous instructions require an explicit suffix (could be "; + for (unsigned i = 0; i != NumMatches; ++i) { + if (i != 0) + OS << ", "; + if (i + 1 == NumMatches) + OS << "or "; + OS << "'" << Base << MatchChars[i] << "'"; + } + OS << ")"; + Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); + return true; + } + + // Okay, we know that none of the variants matched successfully. + + // If all of the instructions reported an invalid mnemonic, then the original + // mnemonic was invalid. + if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && + (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { + if (!WasOriginallyInvalidOperand) { + ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges : + Op->getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", + Ranges, MatchingInlineAsm); + } + + // Recover location info for the operand if we know which was the problem. 
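+  // When the matcher reports Match_InvalidOperand, ErrorInfo (unless it is
+  // ~0U) holds the index into Operands of the operand it rejected, which is
+  // what lets the code below point the diagnostic at that operand's source
+  // range instead of at the mnemonic.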
+ if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction", + EmptyRanges, MatchingInlineAsm); + + X86Operand *Operand = (X86Operand*)Operands[ErrorInfo]; + if (Operand->getStartLoc().isValid()) { + SMRange OperandRange = Operand->getLocRange(); + return Error(Operand->getStartLoc(), "invalid operand for instruction", + OperandRange, MatchingInlineAsm); + } + } + + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) + + (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){ + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) { + if (ErrorInfoMissingFeature & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) + + (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){ + Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + return true; + } + + // If all of these were an outright failure, report it in a useless way. + Error(IDLoc, "unknown use of instruction mnemonic without a size suffix", + EmptyRanges, MatchingInlineAsm); + return true; +} + + +bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return ParseDirectiveWord(2, DirectiveID.getLoc()); + else if (IDVal.startswith(".code")) + return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); + else if (IDVal.startswith(".att_syntax")) { + getParser().setAssemblerDialect(0); + return false; + } else if (IDVal.startswith(".intel_syntax")) { + getParser().setAssemblerDialect(1); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if(Parser.getTok().getString() == "noprefix") { + // FIXME : Handle noprefix + Parser.Lex(); + } else + return true; + } + return false; + } + return true; +} + +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// ParseDirectiveCode +/// ::= .code32 | .code64 +bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { + if (IDVal == ".code32") { + Parser.Lex(); + if (is64BitMode()) { + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + } + } else if (IDVal == ".code64") { + Parser.Lex(); + if (!is64BitMode()) { + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64); + } + } else { + return Error(L, "unexpected directive " + IDVal); + } + + return false; +} + +// Force static initialization. 
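+// Registering against both targets is what lets MC clients obtain this
+// parser from a target triple.  A minimal sketch of a typical lookup (exact
+// signatures vary across LLVM versions):
+//
+//   std::string Err;
+//   const Target *T = TargetRegistry::lookupTarget("x86_64-unknown-linux", Err);
+//   MCTargetAsmParser *TAP = T->createMCAsmParser(STI, Parser);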
+extern "C" void LLVMInitializeX86AsmParser() { + RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target); + RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#define GET_SUBTARGET_FEATURE_NAME +#include "X86GenAsmMatcher.inc" diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp new file mode 100644 index 0000000..ca6f80c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -0,0 +1,814 @@ +//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains code to translate the data produced by the decoder into +// MCInsts. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#include "X86Disassembler.h" +#include "X86DisassemblerDecoder.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" + +using namespace llvm; +using namespace llvm::X86Disassembler; + +void x86DisassemblerDebug(const char *file, + unsigned line, + const char *s) { + dbgs() << file << ":" << line << ": " << s; +} + +const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) { + const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); + return MII->getName(Opcode); +} + +#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s)); + +namespace llvm { + +// Fill-ins to make the compiler happy. These constants are never actually +// assigned; they are just filler to make an automatically-generated switch +// statement work. +namespace X86 { + enum { + BX_SI = 500, + BX_DI = 501, + BP_SI = 502, + BP_DI = 503, + sib = 504, + sib64 = 505 + }; +} + +extern Target TheX86_32Target, TheX86_64Target; + +} + +static bool translateInstruction(MCInst &target, + InternalInstruction &source, + const MCDisassembler *Dis); + +X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, + DisassemblerMode mode, + const MCInstrInfo *MII) + : MCDisassembler(STI), MII(MII), fMode(mode) {} + +X86GenericDisassembler::~X86GenericDisassembler() { + delete MII; +} + +/// regionReader - a callback function that wraps the readByte method from +/// MemoryObject. +/// +/// @param arg - The generic callback parameter. In this case, this should +/// be a pointer to a MemoryObject. +/// @param byte - A pointer to the byte to be read. +/// @param address - The address to be read. +static int regionReader(const void* arg, uint8_t* byte, uint64_t address) { + const MemoryObject* region = static_cast<const MemoryObject*>(arg); + return region->readByte(address, byte); +} + +/// logger - a callback function that wraps the operator<< method from +/// raw_ostream. 
+///
+/// @param arg - The generic callback parameter. This should be a pointer
+///              to a raw_ostream.
+/// @param log - A string to be logged. logger() adds a newline.
+static void logger(void* arg, const char* log) {
+  if (!arg)
+    return;
+
+  raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+  vStream << log << "\n";
+}
+
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus
+X86GenericDisassembler::getInstruction(MCInst &instr,
+                                       uint64_t &size,
+                                       const MemoryObject &region,
+                                       uint64_t address,
+                                       raw_ostream &vStream,
+                                       raw_ostream &cStream) const {
+  CommentStream = &cStream;
+
+  InternalInstruction internalInstr;
+
+  dlog_t loggerFn = logger;
+  if (&vStream == &nulls())
+    loggerFn = 0; // Disable logging completely if it's going to nulls().
+
+  int ret = decodeInstruction(&internalInstr,
+                              regionReader,
+                              (const void*)&region,
+                              loggerFn,
+                              (void*)&vStream,
+                              (const void*)MII,
+                              address,
+                              fMode);
+
+  if (ret) {
+    size = internalInstr.readerCursor - address;
+    return Fail;
+  }
+  else {
+    size = internalInstr.length;
+    return (!translateInstruction(instr, internalInstr, this)) ?
+            Success : Fail;
+  }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+/// register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param reg    - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+  uint8_t llvmRegnums[] = {
+    ALL_REGS
+    0
+  };
+#undef ENTRY
+
+  uint8_t llvmRegnum = llvmRegnums[reg];
+  mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value    - The immediate Value, has had any PC adjustment made by
+///                   the caller.
+/// @param isBranch - If the instruction is a branch instruction
+/// @param Address  - The starting address of the instruction
+/// @param Offset   - The byte offset to this immediate in the instruction
+/// @param Width    - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width.  If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created.  This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const MCDisassembler *Dis) {
+  LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+  struct LLVMOpInfo1 SymbolicOp;
+  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+  void *DisInfo = Dis->getDisInfoBlock();
+
+  if (!getOpInfo ||
+      !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
+    // Clear SymbolicOp.Value from above and also all other fields.
+    memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+    LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+    if (!SymbolLookUp)
+      return false;
+    uint64_t ReferenceType;
+    if (isBranch)
+      ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+    else
+      ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    const char *ReferenceName;
+    const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+                                    &ReferenceName);
+    if (Name) {
+      SymbolicOp.AddSymbol.Name = Name;
+      SymbolicOp.AddSymbol.Present = true;
+    }
+    // For branches always create an MCExpr so it gets printed as hex address.
+    else if (isBranch) {
+      SymbolicOp.Value = Value;
+    }
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+      (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
+    if (!Name && !isBranch)
+      return false;
+  }
+
+  MCContext *Ctx = Dis->getMCContext();
+  const MCExpr *Add = NULL;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Sub = NULL;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Off = NULL;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off != 0)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, *Ctx);
+  }
+
+  MI.addOperand(MCOperand::CreateExpr(Expr));
+
+  return true;
+}
+
+/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is
+/// being referenced by a load instruction with the base register that is the
+/// rip.  These can often be addresses in a literal pool.  The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry.  The SymbolLookUp callback will
+/// return a pointer to a literal 'C' string if the referenced address is an
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+  if (SymbolLookUp) {
+    void *DisInfo = Dis->getDisInfoBlock();
+    uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+    const char *ReferenceName;
+    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+      (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+  }
+}
+
+/// translateImmediate - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst    - The MCInst to append to.
+/// @param immediate - The immediate value to append.
+/// @param operand   - The operand, as stored in the descriptor table.
+/// @param insn      - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+                               const OperandSpecifier &operand,
+                               InternalInstruction &insn,
+                               const MCDisassembler *Dis) {
+  // Sign-extend the immediate if necessary.
+
+  OperandType type = (OperandType)operand.type;
+
+  bool isBranch = false;
+  uint64_t pcrel = 0;
+  if (type == TYPE_RELv) {
+    isBranch = true;
+    pcrel = insn.startLocation +
+            insn.immediateOffset + insn.immediateSize;
+    switch (insn.displacementSize) {
+    default:
+      break;
+    case 1:
+      type = TYPE_MOFFS8;
+      break;
+    case 2:
+      type = TYPE_MOFFS16;
+      break;
+    case 4:
+      type = TYPE_MOFFS32;
+      break;
+    case 8:
+      type = TYPE_MOFFS64;
+      break;
+    }
+  }
+  // By default sign-extend all X86 immediates based on their encoding.
+  else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+           type == TYPE_IMM64) {
+    uint32_t Opcode = mcInst.getOpcode();
+    switch (operand.encoding) {
+    default:
+      break;
+    case ENCODING_IB:
+      // Special case those X86 instructions that use the imm8 as a set of
+      // bits, bit count, etc. and are not sign-extended.
+      if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
+          Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+          Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+          Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+          Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+          Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+          Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+          Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+          Opcode != X86::VINSERTPSrr)
+        type = TYPE_MOFFS8;
+      break;
+    case ENCODING_IW:
+      type = TYPE_MOFFS16;
+      break;
+    case ENCODING_ID:
+      type = TYPE_MOFFS32;
+      break;
+    case ENCODING_IO:
+      type = TYPE_MOFFS64;
+      break;
+    }
+  }
+
+  switch (type) {
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+    mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM256:
+    mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
+    return;
+  case TYPE_REL8:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    // fall through to sign extend the immediate if needed.
+  case TYPE_MOFFS8:
+    if(immediate & 0x80)
+      immediate |= ~(0xffull);
+    break;
+  case TYPE_MOFFS16:
+    if(immediate & 0x8000)
+      immediate |= ~(0xffffull);
+    break;
+  case TYPE_REL32:
+  case TYPE_REL64:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    // fall through to sign extend the immediate if needed.
+  case TYPE_MOFFS32:
+    if(immediate & 0x80000000)
+      immediate |= ~(0xffffffffull);
+    break;
+  case TYPE_MOFFS64:
+  default:
+    // operand is 64 bits wide.  Do nothing.
+    break;
+  }
+
+  if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+                               insn.immediateOffset, insn.immediateSize,
+                               mcInst, Dis))
+    mcInst.addOperand(MCOperand::CreateImm(immediate));
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+/// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst - The MCInst to append to.
+/// @param insn   - The internal instruction to extract the R/M field
+///                 from.
+/// @return - 0 on success; -1 otherwise +static bool translateRMRegister(MCInst &mcInst, + InternalInstruction &insn) { + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + debug("A R/M register operand may not have a SIB byte"); + return true; + } + + switch (insn.eaBase) { + default: + debug("Unexpected EA base register"); + return true; + case EA_BASE_NONE: + debug("EA_BASE_NONE for ModR/M base"); + return true; +#define ENTRY(x) case EA_BASE_##x: + ALL_EA_BASES +#undef ENTRY + debug("A R/M register operand may not have a base; " + "the operand must be a register."); + return true; +#define ENTRY(x) \ + case EA_REG_##x: \ + mcInst.addOperand(MCOperand::CreateReg(X86::x)); break; + ALL_REGS +#undef ENTRY + } + + return false; +} + +/// translateRMMemory - Translates a memory operand stored in the Mod and R/M +/// fields of an internal instruction (and possibly its SIB byte) to a memory +/// operand in LLVM's format, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, + const MCDisassembler *Dis) { + // Addresses in an MCInst are represented as five operands: + // 1. basereg (register) The R/M base, or (if there is a SIB) the + // SIB base + // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified + // scale amount + // 3. indexreg (register) x86_registerNONE, or (if there is a SIB) + // the index (which is multiplied by the + // scale amount) + // 4. displacement (immediate) 0, or the displacement if there is one + // 5. segmentreg (register) x86_registerNONE for now, but could be set + // if we have segment overrides + + MCOperand baseReg; + MCOperand scaleAmount; + MCOperand indexReg; + MCOperand displacement; + MCOperand segmentReg; + uint64_t pcrel = 0; + + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + if (insn.sibBase != SIB_BASE_NONE) { + switch (insn.sibBase) { + default: + debug("Unexpected sibBase"); + return true; +#define ENTRY(x) \ + case SIB_BASE_##x: \ + baseReg = MCOperand::CreateReg(X86::x); break; + ALL_SIB_BASES +#undef ENTRY + } + } else { + baseReg = MCOperand::CreateReg(0); + } + + // Check whether we are handling VSIB addressing mode for GATHER. + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and + // we should use SIB_INDEX_XMM4|YMM4 for VSIB. + // I don't see a way to get the correct IndexReg in readSIB: + // We can tell whether it is VSIB or SIB after instruction ID is decoded, + // but instruction ID may not be decoded yet when calling readSIB. + uint32_t Opcode = mcInst.getOpcode(); + bool IndexIs128 = (Opcode == X86::VGATHERDPDrm || + Opcode == X86::VGATHERDPDYrm || + Opcode == X86::VGATHERQPDrm || + Opcode == X86::VGATHERDPSrm || + Opcode == X86::VGATHERQPSrm || + Opcode == X86::VPGATHERDQrm || + Opcode == X86::VPGATHERDQYrm || + Opcode == X86::VPGATHERQQrm || + Opcode == X86::VPGATHERDDrm || + Opcode == X86::VPGATHERQDrm); + bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || + Opcode == X86::VGATHERDPSYrm || + Opcode == X86::VGATHERQPSYrm || + Opcode == X86::VPGATHERQQYrm || + Opcode == X86::VPGATHERDDYrm || + Opcode == X86::VPGATHERQDYrm); + if (IndexIs128 || IndexIs256) { + unsigned IndexOffset = insn.sibIndex - + (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); + SIBIndex IndexBase = IndexIs256 ? 
SIB_INDEX_YMM0 : SIB_INDEX_XMM0; + insn.sibIndex = (SIBIndex)(IndexBase + + (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); + } + + if (insn.sibIndex != SIB_INDEX_NONE) { + switch (insn.sibIndex) { + default: + debug("Unexpected sibIndex"); + return true; +#define ENTRY(x) \ + case SIB_INDEX_##x: \ + indexReg = MCOperand::CreateReg(X86::x); break; + EA_BASES_32BIT + EA_BASES_64BIT + REGS_XMM + REGS_YMM +#undef ENTRY + } + } else { + indexReg = MCOperand::CreateReg(0); + } + + scaleAmount = MCOperand::CreateImm(insn.sibScale); + } else { + switch (insn.eaBase) { + case EA_BASE_NONE: + if (insn.eaDisplacement == EA_DISP_NONE) { + debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base"); + return true; + } + if (insn.mode == MODE_64BIT){ + pcrel = insn.startLocation + + insn.displacementOffset + insn.displacementSize; + tryAddingPcLoadReferenceComment(insn.startLocation + + insn.displacementOffset, + insn.displacement + pcrel, Dis); + baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6 + } + else + baseReg = MCOperand::CreateReg(0); + + indexReg = MCOperand::CreateReg(0); + break; + case EA_BASE_BX_SI: + baseReg = MCOperand::CreateReg(X86::BX); + indexReg = MCOperand::CreateReg(X86::SI); + break; + case EA_BASE_BX_DI: + baseReg = MCOperand::CreateReg(X86::BX); + indexReg = MCOperand::CreateReg(X86::DI); + break; + case EA_BASE_BP_SI: + baseReg = MCOperand::CreateReg(X86::BP); + indexReg = MCOperand::CreateReg(X86::SI); + break; + case EA_BASE_BP_DI: + baseReg = MCOperand::CreateReg(X86::BP); + indexReg = MCOperand::CreateReg(X86::DI); + break; + default: + indexReg = MCOperand::CreateReg(0); + switch (insn.eaBase) { + default: + debug("Unexpected eaBase"); + return true; + // Here, we will use the fill-ins defined above. However, + // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and + // sib and sib64 were handled in the top-level if, so they're only + // placeholders to keep the compiler happy. +#define ENTRY(x) \ + case EA_BASE_##x: \ + baseReg = MCOperand::CreateReg(X86::x); break; + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) case EA_REG_##x: + ALL_REGS +#undef ENTRY + debug("A R/M memory operand may not be a register; " + "the base field must be a base."); + return true; + } + } + + scaleAmount = MCOperand::CreateImm(1); + } + + displacement = MCOperand::CreateImm(insn.displacement); + + static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { + 0, // SEG_OVERRIDE_NONE + X86::CS, + X86::SS, + X86::DS, + X86::ES, + X86::FS, + X86::GS + }; + + segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); + + mcInst.addOperand(baseReg); + mcInst.addOperand(scaleAmount); + mcInst.addOperand(indexReg); + if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false, + insn.startLocation, insn.displacementOffset, + insn.displacementSize, mcInst, Dis)) + mcInst.addOperand(displacement); + mcInst.addOperand(segmentReg); + return false; +} + +/// translateRM - Translates an operand stored in the R/M (and possibly SIB) +/// byte of an instruction to LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. 
+/// @return - 0 on success; nonzero otherwise +static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn, const MCDisassembler *Dis) { + switch (operand.type) { + default: + debug("Unexpected type for a R/M operand"); + return true; + case TYPE_R8: + case TYPE_R16: + case TYPE_R32: + case TYPE_R64: + case TYPE_Rv: + case TYPE_MM: + case TYPE_MM32: + case TYPE_MM64: + case TYPE_XMM: + case TYPE_XMM32: + case TYPE_XMM64: + case TYPE_XMM128: + case TYPE_XMM256: + case TYPE_DEBUGREG: + case TYPE_CONTROLREG: + return translateRMRegister(mcInst, insn); + case TYPE_M: + case TYPE_M8: + case TYPE_M16: + case TYPE_M32: + case TYPE_M64: + case TYPE_M128: + case TYPE_M256: + case TYPE_M512: + case TYPE_Mv: + case TYPE_M32FP: + case TYPE_M64FP: + case TYPE_M80FP: + case TYPE_M16INT: + case TYPE_M32INT: + case TYPE_M64INT: + case TYPE_M1616: + case TYPE_M1632: + case TYPE_M1664: + case TYPE_LEA: + return translateRMMemory(mcInst, insn, Dis); + } +} + +/// translateFPRegister - Translates a stack position on the FPU stack to its +/// LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param stackPos - The stack position to translate. +/// @return - 0 on success; nonzero otherwise. +static bool translateFPRegister(MCInst &mcInst, + uint8_t stackPos) { + if (stackPos >= 8) { + debug("Invalid FP stack position"); + return true; + } + + mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos)); + + return false; +} + +/// translateOperand - Translates an operand stored in an internal instruction +/// to LLVM's format and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. +static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn, + const MCDisassembler *Dis) { + switch (operand.encoding) { + default: + debug("Unhandled operand encoding during translation"); + return true; + case ENCODING_REG: + translateRegister(mcInst, insn.reg); + return false; + case ENCODING_RM: + return translateRM(mcInst, operand, insn, Dis); + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + debug("Translation of code offsets isn't supported."); + return true; + case ENCODING_IB: + case ENCODING_IW: + case ENCODING_ID: + case ENCODING_IO: + case ENCODING_Iv: + case ENCODING_Ia: + translateImmediate(mcInst, + insn.immediates[insn.numImmediatesTranslated++], + operand, + insn, + Dis); + return false; + case ENCODING_RB: + case ENCODING_RW: + case ENCODING_RD: + case ENCODING_RO: + translateRegister(mcInst, insn.opcodeRegister); + return false; + case ENCODING_I: + return translateFPRegister(mcInst, insn.opcodeModifier); + case ENCODING_Rv: + translateRegister(mcInst, insn.opcodeRegister); + return false; + case ENCODING_VVVV: + translateRegister(mcInst, insn.vvvv); + return false; + case ENCODING_DUP: + return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0], + insn, Dis); + } +} + +/// translateInstruction - Translates an internal instruction and all its +/// operands to an MCInst. +/// +/// @param mcInst - The MCInst to populate with the instruction's data. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. 
+static bool translateInstruction(MCInst &mcInst, + InternalInstruction &insn, + const MCDisassembler *Dis) { + if (!insn.spec) { + debug("Instruction has no specification"); + return true; + } + + mcInst.setOpcode(insn.instructionID); + + int index; + + insn.numImmediatesTranslated = 0; + + for (index = 0; index < X86_MAX_OPERANDS; ++index) { + if (insn.operands[index].encoding != ENCODING_NONE) { + if (translateOperand(mcInst, insn.operands[index], insn, Dis)) { + return true; + } + } + } + + return false; +} + +static MCDisassembler *createX86_32Disassembler(const Target &T, + const MCSubtargetInfo &STI) { + return new X86Disassembler::X86GenericDisassembler(STI, MODE_32BIT, + T.createMCInstrInfo()); +} + +static MCDisassembler *createX86_64Disassembler(const Target &T, + const MCSubtargetInfo &STI) { + return new X86Disassembler::X86GenericDisassembler(STI, MODE_64BIT, + T.createMCInstrInfo()); +} + +extern "C" void LLVMInitializeX86Disassembler() { + // Register the disassembler. + TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + createX86_32Disassembler); + TargetRegistry::RegisterMCDisassembler(TheX86_64Target, + createX86_64Disassembler); +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h new file mode 100644 index 0000000..b92427a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h @@ -0,0 +1,131 @@ +//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and +// 64-bit X86 instruction sets. The main decode sequence for an assembly +// instruction in this disassembler is: +// +// 1. Read the prefix bytes and determine the attributes of the instruction. +// These attributes, recorded in enum attributeBits +// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM +// provides a mapping from bitmasks to contexts, which are represented by +// enum InstructionContext (ibid.). +// +// 2. Read the opcode, and determine what kind of opcode it is. The +// disassembler distinguishes four kinds of opcodes, which are enumerated in +// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte +// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a +// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context. +// +// 3. Depending on the opcode type, look in one of four ClassDecision structures +// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which +// OpcodeDecision (ibid.) to look the opcode in. Look up the opcode, to get +// a ModRMDecision (ibid.). +// +// 4. Some instructions, such as escape opcodes or extended opcodes, or even +// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the +// ModR/M byte to complete decode. The ModRMDecision's type is an entry from +// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the +// ModR/M byte is required and how to interpret it. +// +// 5. After resolving the ModRMDecision, the disassembler has a unique ID +// of type InstrUID (X86DisassemblerDecoderCommon.h). 
Looking this ID up in
+//    INSTRUCTIONS_SYM yields the name of the instruction and the encodings
+//    and meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+//    (X86DisassemblerDecoderCommon.h) and its type is an entry from
+//    OperandType (ibid.). The encoding indicates how to read it from the
+//    instruction; the type indicates how to interpret the value once it has
+//    been read. For example, a register operand could be stored in the R/M
+//    field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode. This is orthogonal to its meaning (a GPR or an XMM
+//    register, for instance). Given this information, the operands can be
+//    extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction
+//    information and operands into a format understandable by the client -
+//    in this case, an MCInst for use by the MC infrastructure.
+//
+// The disassembler is broadly broken into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+// adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+// responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLER_H
+#define X86DISASSEMBLER_H
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+  uint16_t operands;
+
+#define INSTRUCTION_IDS \
+  uint16_t instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MemoryObject;
+class raw_ostream;
+
+namespace X86Disassembler {
+
+/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
+/// All that each platform class should have to do is subclass the
+/// constructor and provide a different disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+  const MCInstrInfo *MII;
+public:
+  /// Constructor - Initializes the disassembler.
+  ///
+  /// @param mode - The X86 architecture mode to decode for.
+  X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode,
+                         const MCInstrInfo *MII);
+private:
+  ~X86GenericDisassembler();
+public:
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+
+private:
+  DisassemblerMode fMode;
+};
+
+} // namespace X86Disassembler
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
new file mode 100644
index 0000000..e40edba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -0,0 +1,1676 @@
+/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the implementation of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include <stdarg.h>   /* for va_*()       */
+#include <stdio.h>    /* for vsnprintf()  */
+#include <stdlib.h>   /* for exit()       */
+#include <string.h>   /* for memset()     */
+
+#include "X86DisassemblerDecoder.h"
+
+#include "X86GenDisassemblerTables.inc"
+
+#define TRUE  1
+#define FALSE 0
+
+typedef int8_t bool;
+
+#ifndef NDEBUG
+#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+
+/*
+ * contextForAttrs - Client for the instruction context table.  Takes a set of
+ *   attributes and returns the appropriate decode context.
+ *
+ * @param attrMask  - Attributes, from the enumeration attributeBits.
+ * @return          - The InstructionContext to use when looking up an
+ *                    instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint8_t attrMask) {
+  return CONTEXTS_SYM[attrMask];
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ *   the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type        - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ *                      contextForAttrs.
+ * @param opcode      - The last byte of the instruction's opcode, not counting
+ *                      ModR/M extensions and escapes.
+ * @return            - TRUE if the ModR/M byte is required, FALSE otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+                         InstructionContext insnContext,
+                         uint8_t opcode) {
+  const struct ContextDecision* decision = 0;
+
+  switch (type) {
+  case ONEBYTE:
+    decision = &ONEBYTE_SYM;
+    break;
+  case TWOBYTE:
+    decision = &TWOBYTE_SYM;
+    break;
+  case THREEBYTE_38:
+    decision = &THREEBYTE38_SYM;
+    break;
+  case THREEBYTE_3A:
+    decision = &THREEBYTE3A_SYM;
+    break;
+  case THREEBYTE_A6:
+    decision = &THREEBYTEA6_SYM;
+    break;
+  case THREEBYTE_A7:
+    decision = &THREEBYTEA7_SYM;
+    break;
+  }
+
+  return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+    modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ *   an instruction.
+ *
+ * @param type        - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode      - See modRMRequired().
+ * @param modRM       - The ModR/M byte if required, or any value if not.
+ * @return            - The UID of the instruction, or 0 on failure.
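+ *
+ * As a worked example of the table layout (mirroring the switch below): for
+ * a MODRM_SPLITREG decision the memory forms come first and the register
+ * forms sit at an offset of 8, so with
+ *
+ *   modRM = 0xd8   (mod = 3, reg = 3, rm = 0)
+ *
+ * the lookup is modRMTable[dec->instructionIDs + ((0xd8 & 0x38) >> 3) + 8],
+ * i.e. the fourth register-form entry; a mod != 3 byte would index the
+ * matching memory-form entry without the +8 bias.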
+ */ +static InstrUID decode(OpcodeType type, + InstructionContext insnContext, + uint8_t opcode, + uint8_t modRM) { + const struct ModRMDecision* dec = 0; + + switch (type) { + case ONEBYTE: + dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case TWOBYTE: + dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_38: + dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_3A: + dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_A6: + dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_A7: + dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + } + + switch (dec->modrm_type) { + default: + debug("Corrupt table! Unknown modrm_type"); + return 0; + case MODRM_ONEENTRY: + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITRM: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+1]; + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITREG: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_SPLITMISC: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_FULL: + return modRMTable[dec->instructionIDs+modRM]; + } +} + +/* + * specifierForUID - Given a UID, returns the name and operand specification for + * that instruction. + * + * @param uid - The unique ID for the instruction. This should be returned by + * decode(); specifierForUID will not check bounds. + * @return - A pointer to the specification for that instruction. + */ +static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { + return &INSTRUCTIONS_SYM[uid]; +} + +/* + * consumeByte - Uses the reader function provided by the user to consume one + * byte from the instruction's memory and advance the cursor. + * + * @param insn - The instruction with the reader function to use. The cursor + * for this instruction is advanced. + * @param byte - A pointer to a pre-allocated memory buffer to be populated + * with the data read. + * @return - 0 if the read was successful; nonzero otherwise. + */ +static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { + int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); + + if (!ret) + ++(insn->readerCursor); + + return ret; +} + +/* + * lookAtByte - Like consumeByte, but does not advance the cursor. + * + * @param insn - See consumeByte(). + * @param byte - See consumeByte(). + * @return - See consumeByte(). 
+ */ +static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { + return insn->reader(insn->readerArg, byte, insn->readerCursor); +} + +static void unconsumeByte(struct InternalInstruction* insn) { + insn->readerCursor--; +} + +#define CONSUME_FUNC(name, type) \ + static int name(struct InternalInstruction* insn, type* ptr) { \ + type combined = 0; \ + unsigned offset; \ + for (offset = 0; offset < sizeof(type); ++offset) { \ + uint8_t byte; \ + int ret = insn->reader(insn->readerArg, \ + &byte, \ + insn->readerCursor + offset); \ + if (ret) \ + return ret; \ + combined = combined | ((uint64_t)byte << (offset * 8)); \ + } \ + *ptr = combined; \ + insn->readerCursor += sizeof(type); \ + return 0; \ + } + +/* + * consume* - Use the reader function provided by the user to consume data + * values of various sizes from the instruction's memory and advance the + * cursor appropriately. These readers perform endian conversion. + * + * @param insn - See consumeByte(). + * @param ptr - A pointer to a pre-allocated memory of appropriate size to + * be populated with the data read. + * @return - See consumeByte(). + */ +CONSUME_FUNC(consumeInt8, int8_t) +CONSUME_FUNC(consumeInt16, int16_t) +CONSUME_FUNC(consumeInt32, int32_t) +CONSUME_FUNC(consumeUInt16, uint16_t) +CONSUME_FUNC(consumeUInt32, uint32_t) +CONSUME_FUNC(consumeUInt64, uint64_t) + +/* + * dbgprintf - Uses the logging function provided by the user to log a single + * message, typically without a carriage-return. + * + * @param insn - The instruction containing the logging function. + * @param format - See printf(). + * @param ... - See printf(). + */ +static void dbgprintf(struct InternalInstruction* insn, + const char* format, + ...) { + char buffer[256]; + va_list ap; + + if (!insn->dlog) + return; + + va_start(ap, format); + (void)vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + insn->dlog(insn->dlogArg, buffer); + + return; +} + +/* + * setPrefixPresent - Marks that a particular prefix is present at a particular + * location. + * + * @param insn - The instruction to be marked as having the prefix. + * @param prefix - The prefix that is present. + * @param location - The location where the prefix is located (in the address + * space of the instruction's reader). + */ +static void setPrefixPresent(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + insn->prefixPresent[prefix] = 1; + insn->prefixLocations[prefix] = location; +} + +/* + * isPrefixAtLocation - Queries an instruction to determine whether a prefix is + * present at a given location. + * + * @param insn - The instruction to be queried. + * @param prefix - The prefix. + * @param location - The location to query. + * @return - Whether the prefix is at that location. + */ +static BOOL isPrefixAtLocation(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + if (insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location) + return TRUE; + else + return FALSE; +} + +/* + * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the + * instruction as having them. Also sets the instruction's default operand, + * address, and other relevant data sizes to report operands correctly. + * + * @param insn - The instruction whose prefixes are to be read. + * @return - 0 if the instruction could be read until the end of the prefix + * bytes, and no prefixes conflicted; nonzero otherwise. 
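+ *
+ * Worked example (illustrative): given the bytes 66 67 f2 0f ... in 32-bit
+ * mode, readPrefixes() consumes 0x66 (operand size), 0x67 (address size),
+ * and 0xf2 (REPNE), leaves the cursor at the 0x0f escape byte, and sets
+ * the register, address, displacement, and immediate sizes to 2 bytes each.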
+ */ +static int readPrefixes(struct InternalInstruction* insn) { + BOOL isPrefix = TRUE; + BOOL prefixGroups[4] = { FALSE }; + uint64_t prefixLocation; + uint8_t byte = 0; + + BOOL hasAdSize = FALSE; + BOOL hasOpSize = FALSE; + + dbgprintf(insn, "readPrefixes()"); + + while (isPrefix) { + prefixLocation = insn->readerCursor; + + if (consumeByte(insn, &byte)) + return -1; + + /* + * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then + * break and let it be disassembled as a normal "instruction". + */ + if (insn->readerCursor - 1 == insn->startLocation + && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) { + uint8_t nextByte; + if (byte == 0xf0) + break; + if (lookAtByte(insn, &nextByte)) + return -1; + if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { + if (consumeByte(insn, &nextByte)) + return -1; + if (lookAtByte(insn, &nextByte)) + return -1; + unconsumeByte(insn); + } + if (nextByte != 0x0f && nextByte != 0x90) + break; + } + + switch (byte) { + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP or REPE/REPZ */ + if (prefixGroups[0]) + dbgprintf(insn, "Redundant Group 1 prefix"); + prefixGroups[0] = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x2e: /* CS segment override -OR- Branch not taken */ + case 0x36: /* SS segment override -OR- Branch taken */ + case 0x3e: /* DS segment override */ + case 0x26: /* ES segment override */ + case 0x64: /* FS segment override */ + case 0x65: /* GS segment override */ + switch (byte) { + case 0x2e: + insn->segmentOverride = SEG_OVERRIDE_CS; + break; + case 0x36: + insn->segmentOverride = SEG_OVERRIDE_SS; + break; + case 0x3e: + insn->segmentOverride = SEG_OVERRIDE_DS; + break; + case 0x26: + insn->segmentOverride = SEG_OVERRIDE_ES; + break; + case 0x64: + insn->segmentOverride = SEG_OVERRIDE_FS; + break; + case 0x65: + insn->segmentOverride = SEG_OVERRIDE_GS; + break; + default: + debug("Unhandled override"); + return -1; + } + if (prefixGroups[1]) + dbgprintf(insn, "Redundant Group 2 prefix"); + prefixGroups[1] = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x66: /* Operand-size override */ + if (prefixGroups[2]) + dbgprintf(insn, "Redundant Group 3 prefix"); + prefixGroups[2] = TRUE; + hasOpSize = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x67: /* Address-size override */ + if (prefixGroups[3]) + dbgprintf(insn, "Redundant Group 4 prefix"); + prefixGroups[3] = TRUE; + hasAdSize = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + default: /* Not a prefix byte */ + isPrefix = FALSE; + break; + } + + if (isPrefix) + dbgprintf(insn, "Found prefix 0x%hhx", byte); + } + + insn->vexSize = 0; + + if (byte == 0xc4) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vexSize = 3; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vexSize == 3) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + consumeByte(insn, &insn->vexPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vexPrefix[2]) << 3) + | (rFromVEX2of3(insn->vexPrefix[1]) << 2) + | (xFromVEX2of3(insn->vexPrefix[1]) << 1) + | (bFromVEX2of3(insn->vexPrefix[1]) << 0); + 
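+
+        /*
+         * R, X, and B are stored inverted in the VEX prefix, so the
+         * accessors used above already complement them; W is stored
+         * directly.
+         */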
} + + switch (ppFromVEX3of3(insn->vexPrefix[2])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]); + } + } + else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vexSize = 2; + } + else { + unconsumeByte(insn); + } + + if (insn->vexSize == 2) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vexPrefix[1]) << 2); + } + + switch (ppFromVEX2of2(insn->vexPrefix[1])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]); + } + } + else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } + + if (insn->mode == MODE_16BIT) { + insn->registerSize = (hasOpSize ? 4 : 2); + insn->addressSize = (hasAdSize ? 4 : 2); + insn->displacementSize = (hasAdSize ? 4 : 2); + insn->immediateSize = (hasOpSize ? 4 : 2); + } else if (insn->mode == MODE_32BIT) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 2 : 4); + insn->displacementSize = (hasAdSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else if (insn->mode == MODE_64BIT) { + if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { + insn->registerSize = 8; + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = 4; + insn->immediateSize = 4; + } else if (insn->rexPrefix) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } + } + + return 0; +} + +/* + * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of + * extended or escape opcodes). + * + * @param insn - The instruction whose opcode is to be read. + * @return - 0 if the opcode could be read successfully; nonzero otherwise. 
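+ *
+ * For example (illustrative), the byte sequence 0f 38 01 leaves
+ * opcodeType == THREEBYTE_38 with opcode == 0x01, while a bare c3 leaves
+ * opcodeType == ONEBYTE with opcode == 0xc3.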
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vexSize == 3)
+  {
+    switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
+    {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vexPrefix[1]));
+      return -1;
+    case 0:
+      break;
+    case VEX_LOB_0F:
+      insn->twoByteEscape = 0x0f;
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->twoByteEscape = 0x0f;
+      insn->threeByteEscape = 0x38;
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->twoByteEscape = 0x0f;
+      insn->threeByteEscape = 0x3a;
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+  else if (insn->vexSize == 2)
+  {
+    insn->twoByteEscape = 0x0f;
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  }
+
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    insn->twoByteEscape = current;
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else if (current == 0xa6) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_A6;
+    } else if (current == 0xa7) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_A7;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *                        instruction.
+ * @param insn          - The instruction whose ID is to be determined.
+ * @param attrMask      - The attribute mask to search.
+ * @return              - 0 if the ModR/M could be read when needed or was not
+ *                        needed; nonzero otherwise.
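+ *
+ * This is the glue between the attribute mask and the generated tables:
+ * contextForAttrs() picks the decode context, modRMRequired() decides
+ * whether a ModR/M byte must be consumed, and decode() then yields the
+ * instruction's UID.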
+ */ +static int getIDWithAttrMask(uint16_t* instructionID, + struct InternalInstruction* insn, + uint8_t attrMask) { + BOOL hasModRMExtension; + + uint8_t instructionClass; + + instructionClass = contextForAttrs(attrMask); + + hasModRMExtension = modRMRequired(insn->opcodeType, + instructionClass, + insn->opcode); + + if (hasModRMExtension) { + if (readModRM(insn)) + return -1; + + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + insn->modRM); + } else { + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + 0); + } + + return 0; +} + +/* + * is16BitEquivalent - Determines whether two instruction names refer to + * equivalent instructions but one is 16-bit whereas the other is not. + * + * @param orig - The instruction that is not 16-bit + * @param equiv - The instruction that is 16-bit + */ +static BOOL is16BitEquivalent(const char* orig, const char* equiv) { + off_t i; + + for (i = 0;; i++) { + if (orig[i] == '\0' && equiv[i] == '\0') + return TRUE; + if (orig[i] == '\0' || equiv[i] == '\0') + return FALSE; + if (orig[i] != equiv[i]) { + if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') + continue; + if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') + continue; + if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') + continue; + return FALSE; + } + } +} + +/* + * getID - Determines the ID of an instruction, consuming the ModR/M byte as + * appropriate for extended and escape opcodes. Determines the attributes and + * context for the instruction before doing so. + * + * @param insn - The instruction whose ID is to be determined. + * @return - 0 if the ModR/M could be read when needed or was not needed; + * nonzero otherwise. + */ +static int getID(struct InternalInstruction* insn, const void *miiArg) { + uint8_t attrMask; + uint16_t instructionID; + + dbgprintf(insn, "getID()"); + + attrMask = ATTR_NONE; + + if (insn->mode == MODE_64BIT) + attrMask |= ATTR_64BIT; + + if (insn->vexSize) { + attrMask |= ATTR_VEX; + + if (insn->vexSize == 3) { + switch (ppFromVEX3of3(insn->vexPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vexPrefix[2])) + attrMask |= ATTR_VEXL; + } + else if (insn->vexSize == 2) { + switch (ppFromVEX2of2(insn->vexPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vexPrefix[1])) + attrMask |= ATTR_VEXL; + } + else { + return -1; + } + } + else { + if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) + attrMask |= ATTR_ADSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + } + + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + /* The following clauses compensate for limitations of the tables. */ + + if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) && + !(attrMask & ATTR_OPSIZE)) { + /* + * Some VEX instructions ignore the L-bit, but use the W-bit. 
Normally L-bit + * has precedence since there are no L-bit with W-bit entries in the tables. + * So if the L-bit isn't significant we should use the W-bit instead. + * We only need to do this if the instruction doesn't specify OpSize since + * there is a VEX_L_W_OPSIZE table. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithWBit; + const struct InstructionSpecifier *specWithWBit; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithWBit, + insn, + (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) { + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithWBit = specifierForUID(instructionIDWithWBit); + + if (instructionID != instructionIDWithWBit) { + insn->instructionID = instructionIDWithWBit; + insn->spec = specWithWBit; + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) { + /* + * The instruction tables make no distinction between instructions that + * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a + * particular spot (i.e., many MMX operations). In general we're + * conservative, but in the specific case where OpSize is present but not + * in the right place we check if there's a 16-bit operation. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithOpsize; + const char *specName, *specWithOpSizeName; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithOpsize, + insn, + attrMask | ATTR_OPSIZE)) { + /* + * ModRM required with OpSize but not present; give up and return version + * without OpSize set + */ + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specName = x86DisassemblerGetInstrName(instructionID, miiArg); + specWithOpSizeName = + x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg); + + if (is16BitEquivalent(specName, specWithOpSizeName)) { + insn->instructionID = instructionIDWithOpsize; + insn->spec = specifierForUID(instructionIDWithOpsize); + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && + insn->rexPrefix & 0x01) { + /* + * NOOP shouldn't decode as NOOP if REX.b is set. Instead + * it should decode as XCHG %r8, %eax. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithNewOpcode; + const struct InstructionSpecifier *specWithNewOpcode; + + spec = specifierForUID(instructionID); + + /* Borrow opcode from one of the other XCHGar opcodes */ + insn->opcode = 0x91; + + if (getIDWithAttrMask(&instructionIDWithNewOpcode, + insn, + attrMask)) { + insn->opcode = 0x90; + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); + + /* Change back */ + insn->opcode = 0x90; + + insn->instructionID = instructionIDWithNewOpcode; + insn->spec = specWithNewOpcode; + + return 0; + } + + insn->instructionID = instructionID; + insn->spec = specifierForUID(insn->instructionID); + + return 0; +} + +/* + * readSIB - Consumes the SIB byte to determine addressing information for an + * instruction. + * + * @param insn - The instruction whose SIB byte is to be read. + * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 
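+ *
+ * Worked example (illustrative): with a 32-bit address size and no REX
+ * bits, a SIB byte of 0x98 decodes as scale = 4 (bits 7-6 == 2), index =
+ * EBX (bits 5-3 == 3), and base = EAX (bits 2-0 == 0), i.e. an effective
+ * address of the form [EAX + EBX*4] plus any displacement.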
+ */ +static int readSIB(struct InternalInstruction* insn) { + SIBIndex sibIndexBase = 0; + SIBBase sibBaseBase = 0; + uint8_t index, base; + + dbgprintf(insn, "readSIB()"); + + if (insn->consumedSIB) + return 0; + + insn->consumedSIB = TRUE; + + switch (insn->addressSize) { + case 2: + dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); + return -1; + break; + case 4: + sibIndexBase = SIB_INDEX_EAX; + sibBaseBase = SIB_BASE_EAX; + break; + case 8: + sibIndexBase = SIB_INDEX_RAX; + sibBaseBase = SIB_BASE_RAX; + break; + } + + if (consumeByte(insn, &insn->sib)) + return -1; + + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + + switch (index) { + case 0x4: + insn->sibIndex = SIB_INDEX_NONE; + break; + default: + insn->sibIndex = (SIBIndex)(sibIndexBase + index); + if (insn->sibIndex == SIB_INDEX_sib || + insn->sibIndex == SIB_INDEX_sib64) + insn->sibIndex = SIB_INDEX_NONE; + break; + } + + switch (scaleFromSIB(insn->sib)) { + case 0: + insn->sibScale = 1; + break; + case 1: + insn->sibScale = 2; + break; + case 2: + insn->sibScale = 4; + break; + case 3: + insn->sibScale = 8; + break; + } + + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); + + switch (base) { + case 0x5: + switch (modFromModRM(insn->modRM)) { + case 0x0: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = SIB_BASE_NONE; + break; + case 0x1: + insn->eaDisplacement = EA_DISP_8; + insn->sibBase = (insn->addressSize == 4 ? + SIB_BASE_EBP : SIB_BASE_RBP); + break; + case 0x2: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = (insn->addressSize == 4 ? + SIB_BASE_EBP : SIB_BASE_RBP); + break; + case 0x3: + debug("Cannot have Mod = 0b11 and a SIB byte"); + return -1; + } + break; + default: + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + } + + return 0; +} + +/* + * readDisplacement - Consumes the displacement of an instruction. + * + * @param insn - The instruction whose displacement is to be read. + * @return - 0 if the displacement byte was successfully read; nonzero + * otherwise. + */ +static int readDisplacement(struct InternalInstruction* insn) { + int8_t d8; + int16_t d16; + int32_t d32; + + dbgprintf(insn, "readDisplacement()"); + + if (insn->consumedDisplacement) + return 0; + + insn->consumedDisplacement = TRUE; + insn->displacementOffset = insn->readerCursor - insn->startLocation; + + switch (insn->eaDisplacement) { + case EA_DISP_NONE: + insn->consumedDisplacement = FALSE; + break; + case EA_DISP_8: + if (consumeInt8(insn, &d8)) + return -1; + insn->displacement = d8; + break; + case EA_DISP_16: + if (consumeInt16(insn, &d16)) + return -1; + insn->displacement = d16; + break; + case EA_DISP_32: + if (consumeInt32(insn, &d32)) + return -1; + insn->displacement = d32; + break; + } + + insn->consumedDisplacement = TRUE; + return 0; +} + +/* + * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and + * displacement) for an instruction and interprets it. + * + * @param insn - The instruction whose addressing information is to be read. + * @return - 0 if the information was successfully read; nonzero otherwise. 
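+ *
+ * Worked example (illustrative): in 32-bit mode, a ModR/M byte of 0x45
+ * splits into mod = 1, reg = 0, rm = 5, which is interpreted as register
+ * EAX plus a memory operand of [EBP + disp8]; the 8-bit displacement is
+ * then consumed by readDisplacement().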
+ */ +static int readModRM(struct InternalInstruction* insn) { + uint8_t mod, rm, reg; + + dbgprintf(insn, "readModRM()"); + + if (insn->consumedModRM) + return 0; + + if (consumeByte(insn, &insn->modRM)) + return -1; + insn->consumedModRM = TRUE; + + mod = modFromModRM(insn->modRM); + rm = rmFromModRM(insn->modRM); + reg = regFromModRM(insn->modRM); + + /* + * This goes by insn->registerSize to pick the correct register, which messes + * up if we're using (say) XMM or 8-bit register operands. That gets fixed in + * fixupReg(). + */ + switch (insn->registerSize) { + case 2: + insn->regBase = MODRM_REG_AX; + insn->eaRegBase = EA_REG_AX; + break; + case 4: + insn->regBase = MODRM_REG_EAX; + insn->eaRegBase = EA_REG_EAX; + break; + case 8: + insn->regBase = MODRM_REG_RAX; + insn->eaRegBase = EA_REG_RAX; + break; + } + + reg |= rFromREX(insn->rexPrefix) << 3; + rm |= bFromREX(insn->rexPrefix) << 3; + + insn->reg = (Reg)(insn->regBase + reg); + + switch (insn->addressSize) { + case 2: + insn->eaBaseBase = EA_BASE_BX_SI; + + switch (mod) { + case 0x0: + if (rm == 0x6) { + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + } else { + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_NONE; + } + break; + case 0x1: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_8; + if (readDisplacement(insn)) + return -1; + break; + case 0x2: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + break; + case 0x3: + insn->eaBase = (EABase)(insn->eaRegBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 4: + case 8: + insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); + + switch (mod) { + case 0x0: + insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ + switch (rm) { + case 0x4: + case 0xc: /* in case REXW.b is set */ + insn->eaBase = (insn->addressSize == 4 ? + EA_BASE_sib : EA_BASE_sib64); + readSIB(insn); + if (readDisplacement(insn)) + return -1; + break; + case 0x5: + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_32; + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + break; + } + break; + case 0x1: + case 0x2: + insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32);
+      switch (rm) {
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = EA_BASE_sib;
+        readSIB(insn);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      }
+      break;
+    case 0x3:
+      insn->eaDisplacement = EA_DISP_NONE;
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      break;
+    }
+    break;
+  } /* switch (insn->addressSize) */
+
+  return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix)            \
+  static uint8_t name(struct InternalInstruction *insn,   \
+                      OperandType type,                   \
+                      uint8_t index,                      \
+                      uint8_t *valid) {                   \
+    *valid = 1;                                           \
+    switch (type) {                                       \
+    default:                                              \
+      debug("Unhandled register type");                   \
+      *valid = 0;                                         \
+      return 0;                                           \
+    case TYPE_Rv:                                         \
+      return base + index;                                \
+    case TYPE_R8:                                         \
+      if (insn->rexPrefix &&                              \
+          index >= 4 && index <= 7) {                     \
+        return prefix##_SPL + (index - 4);                \
+      } else {                                            \
+        return prefix##_AL + index;                       \
+      }                                                   \
+    case TYPE_R16:                                        \
+      return prefix##_AX + index;                         \
+    case TYPE_R32:                                        \
+      return prefix##_EAX + index;                        \
+    case TYPE_R64:                                        \
+      return prefix##_RAX + index;                        \
+    case TYPE_XMM256:                                     \
+      return prefix##_YMM0 + index;                       \
+    case TYPE_XMM128:                                     \
+    case TYPE_XMM64:                                      \
+    case TYPE_XMM32:                                      \
+    case TYPE_XMM:                                        \
+      return prefix##_XMM0 + index;                       \
+    case TYPE_MM64:                                       \
+    case TYPE_MM32:                                       \
+    case TYPE_MM:                                         \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_MM0 + index;                        \
+    case TYPE_SEGMENTREG:                                 \
+      if (index > 5)                                      \
+        *valid = 0;                                       \
+      return prefix##_ES + index;                         \
+    case TYPE_DEBUGREG:                                   \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_DR0 + index;                        \
+    case TYPE_CONTROLREG:                                 \
+      if (index > 8)                                      \
+        *valid = 0;                                       \
+      return prefix##_CR0 + index;                        \
+    }                                                     \
+  }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ *   reg or R/M field.  If the operand is an XMM operand, for example, the
+ *   correct value would be XMM0 instead of AX, which is what readModRM()
+ *   would otherwise report.
+ *
+ * @param insn  - The instruction containing the operand.
+ * @param type  - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t.  The target is set to 1 if the
+ *                field is valid for the register class; 0 if not.
+ * @return      - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase,   MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue,  insn->eaRegBase, EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ *   fixup*Value functions to use in correcting readModRM()'s interpretation.
+ *
+ * @param insn - See fixup*Value().
+ * @param op   - The operand specifier.
+ * @return     - 0 if fixup was successful; -1 if the register returned was
+ *               invalid for its class.
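+ *
+ * For example (illustrative): if readModRM() reported reg = 2 but the
+ * operand's type is TYPE_XMM128, fixupRegValue() remaps the raw index to
+ * XMM2 rather than leaving it as EDX.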
+ */
+static int fixupReg(struct InternalInstruction *insn,
+                    const struct OperandSpecifier *op) {
+  uint8_t valid;
+
+  dbgprintf(insn, "fixupReg()");
+
+  switch ((OperandEncoding)op->encoding) {
+  default:
+    debug("Expected a REG or R/M encoding in fixupReg");
+    return -1;
+  case ENCODING_VVVV:
+    insn->vvvv = (Reg)fixupRegValue(insn,
+                                    (OperandType)op->type,
+                                    insn->vvvv,
+                                    &valid);
+    if (!valid)
+      return -1;
+    break;
+  case ENCODING_REG:
+    insn->reg = (Reg)fixupRegValue(insn,
+                                   (OperandType)op->type,
+                                   insn->reg - insn->regBase,
+                                   &valid);
+    if (!valid)
+      return -1;
+    break;
+  case ENCODING_RM:
+    if (insn->eaBase >= insn->eaRegBase) {
+      insn->eaBase = (EABase)fixupRMValue(insn,
+                                          (OperandType)op->type,
+                                          insn->eaBase - insn->eaRegBase,
+                                          &valid);
+      if (!valid)
+        return -1;
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcodeModifier - Reads an operand from the opcode field of an
+ *   instruction.  Handles AddRegFrm instructions.
+ *
+ * @param insn - The instruction whose opcode field is to be read.
+ * @return     - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeModifier(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readOpcodeModifier()");
+
+  if (insn->consumedOpcodeModifier)
+    return 0;
+
+  insn->consumedOpcodeModifier = TRUE;
+
+  switch (insn->spec->modifierType) {
+  default:
+    debug("Unknown modifier type.");
+    return -1;
+  case MODIFIER_NONE:
+    debug("No modifier but an operand expects one.");
+    return -1;
+  case MODIFIER_OPCODE:
+    insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
+    return 0;
+  case MODIFIER_MODRM:
+    insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
+    return 0;
+  }
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ *   instruction and interprets it appropriately given the operand width.
+ *   Handles AddRegFrm instructions.
+ *
+ * @param insn - See readOpcodeModifier().
+ * @param size - The width (in bytes) of the register being specified.
+ *               1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ *               RAX.
+ * @return     - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+  dbgprintf(insn, "readOpcodeRegister()");
+
+  if (readOpcodeModifier(insn))
+    return -1;
+
+  if (size == 0)
+    size = insn->registerSize;
+
+  switch (size) {
+  case 1:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+                                                 | insn->opcodeModifier));
+    if (insn->rexPrefix &&
+        insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+        insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+      insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+                                   + (insn->opcodeRegister - MODRM_REG_AL - 4));
+    }
+
+    break;
+  case 2:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | insn->opcodeModifier));
+    break;
+  case 4:
+    insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | insn->opcodeModifier));
+    break;
+  case 8:
+    insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | insn->opcodeModifier));
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ *   desired operand size.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @param size - The width (in bytes) of the operand.
+ * @return     - 0 if the immediate was successfully consumed; nonzero
+ *               otherwise.
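+ *
+ * A size of 0 requests the default immediateSize computed by
+ * readPrefixes(); any explicit size overrides (and records) that default.
+ * At most two immediates can be consumed per instruction.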
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+  uint8_t imm8;
+  uint16_t imm16;
+  uint32_t imm32;
+  uint64_t imm64;
+
+  dbgprintf(insn, "readImmediate()");
+
+  if (insn->numImmediatesConsumed == 2) {
+    debug("Already consumed two immediates");
+    return -1;
+  }
+
+  if (size == 0)
+    size = insn->immediateSize;
+  else
+    insn->immediateSize = size;
+  insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+  switch (size) {
+  case 1:
+    if (consumeByte(insn, &imm8))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm8;
+    break;
+  case 2:
+    if (consumeUInt16(insn, &imm16))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm16;
+    break;
+  case 4:
+    if (consumeUInt32(insn, &imm32))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm32;
+    break;
+  case 8:
+    if (consumeUInt64(insn, &imm64))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm64;
+    break;
+  }
+
+  insn->numImmediatesConsumed++;
+
+  return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @return     - 0 if the vvvv was successfully consumed; nonzero
+ *               otherwise.
+ */
+static int readVVVV(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readVVVV()");
+
+  if (insn->vexSize == 3)
+    insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
+  else if (insn->vexSize == 2)
+    insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]);
+  else
+    return -1;
+
+  if (insn->mode != MODE_64BIT)
+    insn->vvvv &= 0x7;
+
+  return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ *   operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn - The instruction whose operands are to be read and interpreted.
+ * @return     - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+  int index;
+  int hasVVVV, needVVVV;
+  int sawRegImm = 0;
+
+  dbgprintf(insn, "readOperands()");
+
+  /* If a non-zero vvvv is specified, we need to make sure one of the
+     operands uses it. */
+  hasVVVV = !readVVVV(insn);
+  needVVVV = hasVVVV && (insn->vvvv != 0);
+
+  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+    switch (x86OperandSets[insn->spec->operands][index].encoding) {
+    case ENCODING_NONE:
+      break;
+    case ENCODING_REG:
+    case ENCODING_RM:
+      if (readModRM(insn))
+        return -1;
+      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+        return -1;
+      break;
+    case ENCODING_CB:
+    case ENCODING_CW:
+    case ENCODING_CD:
+    case ENCODING_CP:
+    case ENCODING_CO:
+    case ENCODING_CT:
+      dbgprintf(insn, "We currently don't handle code-offset encodings");
+      return -1;
+    case ENCODING_IB:
+      if (sawRegImm) {
+        /* Saw a register immediate so don't read again and instead split the
+           previous immediate.  FIXME: This is a hack.
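+           (The register-immediate forms pack a source register in
+           imm8[7:4] and a small constant in imm8[3:0], so the low nibble
+           recovered here is the real immediate.)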
*/ + insn->immediates[insn->numImmediatesConsumed] = + insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; + ++insn->numImmediatesConsumed; + break; + } + if (readImmediate(insn, 1)) + return -1; + if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 && + insn->immediates[insn->numImmediatesConsumed - 1] > 7) + return -1; + if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 && + insn->immediates[insn->numImmediatesConsumed - 1] > 31) + return -1; + if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 || + x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256) + sawRegImm = 1; + break; + case ENCODING_IW: + if (readImmediate(insn, 2)) + return -1; + break; + case ENCODING_ID: + if (readImmediate(insn, 4)) + return -1; + break; + case ENCODING_IO: + if (readImmediate(insn, 8)) + return -1; + break; + case ENCODING_Iv: + if (readImmediate(insn, insn->immediateSize)) + return -1; + break; + case ENCODING_Ia: + if (readImmediate(insn, insn->addressSize)) + return -1; + break; + case ENCODING_RB: + if (readOpcodeRegister(insn, 1)) + return -1; + break; + case ENCODING_RW: + if (readOpcodeRegister(insn, 2)) + return -1; + break; + case ENCODING_RD: + if (readOpcodeRegister(insn, 4)) + return -1; + break; + case ENCODING_RO: + if (readOpcodeRegister(insn, 8)) + return -1; + break; + case ENCODING_Rv: + if (readOpcodeRegister(insn, 0)) + return -1; + break; + case ENCODING_I: + if (readOpcodeModifier(insn)) + return -1; + break; + case ENCODING_VVVV: + needVVVV = 0; /* Mark that we have found a VVVV operand. */ + if (!hasVVVV) + return -1; + if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index])) + return -1; + break; + case ENCODING_DUP: + break; + default: + dbgprintf(insn, "Encountered an operand with an unknown encoding."); + return -1; + } + } + + /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */ + if (needVVVV) return -1; + + return 0; +} + +/* + * decodeInstruction - Reads and interprets a full instruction provided by the + * user. + * + * @param insn - A pointer to the instruction to be populated. Must be + * pre-allocated. + * @param reader - The function to be used to read the instruction's bytes. + * @param readerArg - A generic argument to be passed to the reader to store + * any internal state. + * @param logger - If non-NULL, the function to be used to write log messages + * and warnings. + * @param loggerArg - A generic argument to be passed to the logger to store + * any internal state. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to + * decode the instruction in. + * @return - 0 if the instruction's memory could be read; nonzero if + * not. 
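+ *
+ * (The miiArg parameter, not listed above, is an opaque value forwarded to
+ * x86DisassemblerGetInstrName() when instruction names must be compared.)
+ *
+ * Typical use (an illustrative sketch; reader and argument names are
+ * hypothetical):
+ *
+ *   struct InternalInstruction insn;
+ *   if (!decodeInstruction(&insn, myReader, &myBuffer, NULL, NULL, myMII,
+ *                          0x1000, MODE_64BIT))
+ *     printf("%u-byte instruction, UID %u\n",
+ *            (unsigned)insn.length, (unsigned)insn.instructionID);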
+ */ +int decodeInstruction(struct InternalInstruction* insn, + byteReader_t reader, + const void* readerArg, + dlog_t logger, + void* loggerArg, + const void* miiArg, + uint64_t startLoc, + DisassemblerMode mode) { + memset(insn, 0, sizeof(struct InternalInstruction)); + + insn->reader = reader; + insn->readerArg = readerArg; + insn->dlog = logger; + insn->dlogArg = loggerArg; + insn->startLocation = startLoc; + insn->readerCursor = startLoc; + insn->mode = mode; + insn->numImmediatesConsumed = 0; + + if (readPrefixes(insn) || + readOpcode(insn) || + getID(insn, miiArg) || + insn->instructionID == 0 || + readOperands(insn)) + return -1; + + insn->operands = &x86OperandSets[insn->spec->operands][0]; + + insn->length = insn->readerCursor - insn->startLocation; + + dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", + startLoc, insn->readerCursor, insn->length); + + if (insn->length > 15) + dbgprintf(insn, "Instruction exceeds 15-byte limit"); + + return 0; +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h new file mode 100644 index 0000000..407ead3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -0,0 +1,588 @@ +/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===* + * + * This file is part of the X86 Disassembler. + * It contains the public interface of the instruction decoder. + * Documentation for the disassembler can be found in X86Disassembler.h. + * + *===----------------------------------------------------------------------===*/ + +#ifndef X86DISASSEMBLERDECODER_H +#define X86DISASSEMBLERDECODER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define INSTRUCTION_SPECIFIER_FIELDS \ + uint16_t operands; + +#define INSTRUCTION_IDS \ + uint16_t instructionIDs; + +#include "X86DisassemblerDecoderCommon.h" + +#undef INSTRUCTION_SPECIFIER_FIELDS +#undef INSTRUCTION_IDS + +/* + * Accessor functions for various fields of an Intel instruction + */ +#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) +#define regFromModRM(modRM) (((modRM) & 0x38) >> 3) +#define rmFromModRM(modRM) ((modRM) & 0x7) +#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6) +#define indexFromSIB(sib) (((sib) & 0x38) >> 3) +#define baseFromSIB(sib) ((sib) & 0x7) +#define wFromREX(rex) (((rex) & 0x8) >> 3) +#define rFromREX(rex) (((rex) & 0x4) >> 2) +#define xFromREX(rex) (((rex) & 0x2) >> 1) +#define bFromREX(rex) ((rex) & 0x1) + +#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7) +#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6) +#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5) +#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f) +#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7) +#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX3of3(vex) ((vex) & 0x3) + +#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7) +#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX2of2(vex) ((vex) & 0x3) + +/* + * These enums represent Intel registers for use by the decoder. 
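+ *
+ * Each register list below is an x-macro: a client defines ENTRY(x) to
+ * stamp out whatever it needs (enum values here; register maps or strings
+ * elsewhere) and then expands the list, as the EABase, SIBIndex, and Reg
+ * enums further down do.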
+ */ + +#define REGS_8BIT \ + ENTRY(AL) \ + ENTRY(CL) \ + ENTRY(DL) \ + ENTRY(BL) \ + ENTRY(AH) \ + ENTRY(CH) \ + ENTRY(DH) \ + ENTRY(BH) \ + ENTRY(R8B) \ + ENTRY(R9B) \ + ENTRY(R10B) \ + ENTRY(R11B) \ + ENTRY(R12B) \ + ENTRY(R13B) \ + ENTRY(R14B) \ + ENTRY(R15B) \ + ENTRY(SPL) \ + ENTRY(BPL) \ + ENTRY(SIL) \ + ENTRY(DIL) + +#define EA_BASES_16BIT \ + ENTRY(BX_SI) \ + ENTRY(BX_DI) \ + ENTRY(BP_SI) \ + ENTRY(BP_DI) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(BP) \ + ENTRY(BX) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define REGS_16BIT \ + ENTRY(AX) \ + ENTRY(CX) \ + ENTRY(DX) \ + ENTRY(BX) \ + ENTRY(SP) \ + ENTRY(BP) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define EA_BASES_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(sib) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define REGS_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(ESP) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define EA_BASES_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(sib64) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(RSP) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_MMX \ + ENTRY(MM0) \ + ENTRY(MM1) \ + ENTRY(MM2) \ + ENTRY(MM3) \ + ENTRY(MM4) \ + ENTRY(MM5) \ + ENTRY(MM6) \ + ENTRY(MM7) + +#define REGS_XMM \ + ENTRY(XMM0) \ + ENTRY(XMM1) \ + ENTRY(XMM2) \ + ENTRY(XMM3) \ + ENTRY(XMM4) \ + ENTRY(XMM5) \ + ENTRY(XMM6) \ + ENTRY(XMM7) \ + ENTRY(XMM8) \ + ENTRY(XMM9) \ + ENTRY(XMM10) \ + ENTRY(XMM11) \ + ENTRY(XMM12) \ + ENTRY(XMM13) \ + ENTRY(XMM14) \ + ENTRY(XMM15) + +#define REGS_YMM \ + ENTRY(YMM0) \ + ENTRY(YMM1) \ + ENTRY(YMM2) \ + ENTRY(YMM3) \ + ENTRY(YMM4) \ + ENTRY(YMM5) \ + ENTRY(YMM6) \ + ENTRY(YMM7) \ + ENTRY(YMM8) \ + ENTRY(YMM9) \ + ENTRY(YMM10) \ + ENTRY(YMM11) \ + ENTRY(YMM12) \ + ENTRY(YMM13) \ + ENTRY(YMM14) \ + ENTRY(YMM15) + +#define REGS_SEGMENT \ + ENTRY(ES) \ + ENTRY(CS) \ + ENTRY(SS) \ + ENTRY(DS) \ + ENTRY(FS) \ + ENTRY(GS) + +#define REGS_DEBUG \ + ENTRY(DR0) \ + ENTRY(DR1) \ + ENTRY(DR2) \ + ENTRY(DR3) \ + ENTRY(DR4) \ + ENTRY(DR5) \ + ENTRY(DR6) \ + ENTRY(DR7) + +#define REGS_CONTROL \ + ENTRY(CR0) \ + ENTRY(CR1) \ + ENTRY(CR2) \ + ENTRY(CR3) \ + ENTRY(CR4) \ + ENTRY(CR5) \ + ENTRY(CR6) \ + ENTRY(CR7) \ + ENTRY(CR8) + +#define ALL_EA_BASES \ + EA_BASES_16BIT \ + EA_BASES_32BIT \ + EA_BASES_64BIT + +#define ALL_SIB_BASES \ + REGS_32BIT \ + REGS_64BIT + +#define ALL_REGS \ + REGS_8BIT \ + REGS_16BIT \ + REGS_32BIT \ + REGS_64BIT \ + REGS_MMX \ + REGS_XMM \ + REGS_YMM \ + REGS_SEGMENT \ + REGS_DEBUG \ + REGS_CONTROL \ + ENTRY(RIP) + +/* + * EABase - All possible values of the base field for effective-address + * computations, a.k.a. the Mod and R/M fields of the ModR/M byte. 
We + * distinguish between bases (EA_BASE_*) and registers that just happen to be + * referred to when Mod == 0b11 (EA_REG_*). + */ +typedef enum { + EA_BASE_NONE, +#define ENTRY(x) EA_BASE_##x, + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) EA_REG_##x, + ALL_REGS +#undef ENTRY + EA_max +} EABase; + +/* + * SIBIndex - All possible values of the SIB index field. + * Borrows entries from ALL_EA_BASES with the special case that + * sib is synonymous with NONE. + * Vector SIB: index can be XMM or YMM. + */ +typedef enum { + SIB_INDEX_NONE, +#define ENTRY(x) SIB_INDEX_##x, + ALL_EA_BASES + REGS_XMM + REGS_YMM +#undef ENTRY + SIB_INDEX_max +} SIBIndex; + +/* + * SIBBase - All possible values of the SIB base field. + */ +typedef enum { + SIB_BASE_NONE, +#define ENTRY(x) SIB_BASE_##x, + ALL_SIB_BASES +#undef ENTRY + SIB_BASE_max +} SIBBase; + +/* + * EADisplacement - Possible displacement types for effective-address + * computations. + */ +typedef enum { + EA_DISP_NONE, + EA_DISP_8, + EA_DISP_16, + EA_DISP_32 +} EADisplacement; + +/* + * Reg - All possible values of the reg field in the ModR/M byte. + */ +typedef enum { +#define ENTRY(x) MODRM_REG_##x, + ALL_REGS +#undef ENTRY + MODRM_REG_max +} Reg; + +/* + * SegmentOverride - All possible segment overrides. + */ +typedef enum { + SEG_OVERRIDE_NONE, + SEG_OVERRIDE_CS, + SEG_OVERRIDE_SS, + SEG_OVERRIDE_DS, + SEG_OVERRIDE_ES, + SEG_OVERRIDE_FS, + SEG_OVERRIDE_GS, + SEG_OVERRIDE_max +} SegmentOverride; + +/* + * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field + */ + +typedef enum { + VEX_LOB_0F = 0x1, + VEX_LOB_0F38 = 0x2, + VEX_LOB_0F3A = 0x3 +} VEXLeadingOpcodeByte; + +/* + * VEXPrefixCode - Possible values for the VEX.pp field + */ + +typedef enum { + VEX_PREFIX_NONE = 0x0, + VEX_PREFIX_66 = 0x1, + VEX_PREFIX_F3 = 0x2, + VEX_PREFIX_F2 = 0x3 +} VEXPrefixCode; + +typedef uint8_t BOOL; + +/* + * byteReader_t - Type for the byte reader that the consumer must provide to + * the decoder. Reads a single byte from the instruction's address space. + * @param arg - A baton that the consumer can associate with any internal + * state that it needs. + * @param byte - A pointer to a single byte in memory that should be set to + * contain the value at address. + * @param address - The address in the instruction's address space that should + * be read from. + * @return - -1 if the byte cannot be read for any reason; 0 otherwise. + */ +typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address); + +/* + * dlog_t - Type for the logging function that the consumer can provide to + * get debugging output from the decoder. + * @param arg - A baton that the consumer can associate with any internal + * state that it needs. + * @param log - A string that contains the message. Will be reused after + * the logger returns. + */ +typedef void (*dlog_t)(void* arg, const char *log); + +/* + * The x86 internal instruction, which is produced by the decoder. 
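+ *
+ * decodeInstruction() zeroes the whole struct, fills it in as bytes are
+ * consumed, and the caller then reads out fields such as instructionID,
+ * spec, operands, and length.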
+ */ +struct InternalInstruction { + /* Reader interface (C) */ + byteReader_t reader; + /* Opaque value passed to the reader */ + const void* readerArg; + /* The address of the next byte to read via the reader */ + uint64_t readerCursor; + + /* Logger interface (C) */ + dlog_t dlog; + /* Opaque value passed to the logger */ + void* dlogArg; + + /* General instruction information */ + + /* The mode to disassemble for (64-bit, protected, real) */ + DisassemblerMode mode; + /* The start of the instruction, usable with the reader */ + uint64_t startLocation; + /* The length of the instruction, in bytes */ + size_t length; + + /* Prefix state */ + + /* 1 if the prefix byte corresponding to the entry is present; 0 if not */ + uint8_t prefixPresent[0x100]; + /* contains the location (for use with the reader) of the prefix byte */ + uint64_t prefixLocations[0x100]; + /* The value of the VEX prefix, if present */ + uint8_t vexPrefix[3]; + /* The length of the VEX prefix (0 if not present) */ + uint8_t vexSize; + /* The value of the REX prefix, if present */ + uint8_t rexPrefix; + /* The location where a mandatory prefix would have to be (i.e., right before + the opcode, or right before the REX prefix if one is present) */ + uint64_t necessaryPrefixLocation; + /* The segment override type */ + SegmentOverride segmentOverride; + + /* Sizes of various critical pieces of data, in bytes */ + uint8_t registerSize; + uint8_t addressSize; + uint8_t displacementSize; + uint8_t immediateSize; + + /* Offsets from the start of the instruction to the pieces of data, which is + needed to find relocation entries for adding symbolic operands */ + uint8_t displacementOffset; + uint8_t immediateOffset; + + /* opcode state */ + + /* The value of the two-byte escape prefix (usually 0x0f) */ + uint8_t twoByteEscape; + /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */ + uint8_t threeByteEscape; + /* The last byte of the opcode, not counting any ModR/M extension */ + uint8_t opcode; + /* The ModR/M byte of the instruction, if it is an opcode extension */ + uint8_t modRMExtension; + + /* decode state */ + + /* The type of opcode, used for indexing into the array of decode tables */ + OpcodeType opcodeType; + /* The instruction ID, extracted from the decode table */ + uint16_t instructionID; + /* The specifier for the instruction, from the instruction info table */ + const struct InstructionSpecifier *spec; + + /* state for additional bytes, consumed during operand decode. Pattern: + consumed___ indicates that the byte was already consumed and does not + need to be consumed again */ + + /* The VEX.vvvv field, which contains a third register operand for some AVX + instructions */ + Reg vvvv; + + /* The ModR/M byte, which contains most register operands and some portion of + all memory operands */ + BOOL consumedModRM; + uint8_t modRM; + + /* The SIB byte, used for more complex 32- or 64-bit memory operands */ + BOOL consumedSIB; + uint8_t sib; + + /* The displacement, used for memory operands */ + BOOL consumedDisplacement; + int32_t displacement; + + /* Immediates. 
There can be two in some cases */ + uint8_t numImmediatesConsumed; + uint8_t numImmediatesTranslated; + uint64_t immediates[2]; + + /* A register or immediate operand encoded into the opcode */ + BOOL consumedOpcodeModifier; + uint8_t opcodeModifier; + Reg opcodeRegister; + + /* Portions of the ModR/M byte */ + + /* These fields determine the allowable values for the ModR/M fields, which + depend on operand and address widths */ + EABase eaBaseBase; + EABase eaRegBase; + Reg regBase; + + /* The Mod and R/M fields can encode a base for an effective address, or a + register. These are separated into two fields here */ + EABase eaBase; + EADisplacement eaDisplacement; + /* The reg field always encodes a register */ + Reg reg; + + /* SIB state */ + SIBIndex sibIndex; + uint8_t sibScale; + SIBBase sibBase; + + const struct OperandSpecifier *operands; +}; + +/* decodeInstruction - Decode one instruction and store the decoding results in + * a buffer provided by the consumer. + * @param insn - The buffer to store the instruction in. Allocated by the + * consumer. + * @param reader - The byteReader_t for the bytes to be read. + * @param readerArg - An argument to pass to the reader for storing context + * specific to the consumer. May be NULL. + * @param logger - The dlog_t to be used in printing status messages from the + * disassembler. May be NULL. + * @param loggerArg - An argument to pass to the logger for storing context + * specific to the logger. May be NULL. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in. + * @return - Nonzero if there was an error during decode, 0 otherwise. + */ +int decodeInstruction(struct InternalInstruction* insn, + byteReader_t reader, + const void* readerArg, + dlog_t logger, + void* loggerArg, + const void* miiArg, + uint64_t startLoc, + DisassemblerMode mode); + +/* x86DisassemblerDebug - C-accessible function for printing a message to + * debugs() + * @param file - The name of the file printing the debug message. + * @param line - The line number that printed the debug message. + * @param s - The message to print. + */ + +void x86DisassemblerDebug(const char *file, + unsigned line, + const char *s); + +const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h new file mode 100644 index 0000000..23dfe4b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -0,0 +1,398 @@ +/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===* + * + * This file is part of the X86 Disassembler. + * It contains common definitions used by both the disassembler and the table + * generator. + * Documentation for the disassembler can be found in X86Disassembler.h. + * + *===----------------------------------------------------------------------===*/ + +/* + * This header file provides those definitions that need to be shared between + * the decoder and the table generator in a C-friendly manner. 
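+ *
+ * (It is included from both the C decoder and the C++ table generator, so
+ * it sticks to plain C constructs and x-macro lists.)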
+ */ + +#ifndef X86DISASSEMBLERDECODERCOMMON_H +#define X86DISASSEMBLERDECODERCOMMON_H + +#include "llvm/Support/DataTypes.h" + +#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers +#define CONTEXTS_SYM x86DisassemblerContexts +#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes +#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes +#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes +#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes +#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes +#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes + +#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" +#define CONTEXTS_STR "x86DisassemblerContexts" +#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes" +#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes" +#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes" +#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" +#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes" +#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes" + +/* + * Attributes of an instruction that must be known before the opcode can be + * processed correctly. Most of these indicate the presence of particular + * prefixes, but ATTR_64BIT is simply an attribute of the decoding context. + */ +#define ATTRIBUTE_BITS \ + ENUM_ENTRY(ATTR_NONE, 0x00) \ + ENUM_ENTRY(ATTR_64BIT, 0x01) \ + ENUM_ENTRY(ATTR_XS, 0x02) \ + ENUM_ENTRY(ATTR_XD, 0x04) \ + ENUM_ENTRY(ATTR_REXW, 0x08) \ + ENUM_ENTRY(ATTR_OPSIZE, 0x10) \ + ENUM_ENTRY(ATTR_ADSIZE, 0x20) \ + ENUM_ENTRY(ATTR_VEX, 0x40) \ + ENUM_ENTRY(ATTR_VEXL, 0x80) + +#define ENUM_ENTRY(n, v) n = v, +enum attributeBits { + ATTRIBUTE_BITS + ATTR_max +}; +#undef ENUM_ENTRY + +/* + * Combinations of the above attributes that are relevant to instruction + * decode. Although other combinations are possible, they can be reduced to + * these without affecting the ultimately decoded instruction. + */ + +/* Class name Rank Rationale for rank assignment */ +#define INSTRUCTION_CONTEXTS \ + ENUM_ENTRY(IC, 0, "says nothing about the instruction") \ + ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \ + "64-bit mode but no more") \ + ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\ + "change width; overrides IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \ + "secondary") \ + ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \ + ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \ + "opcode") \ + ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \ + "IC_64BIT_REXW_XS") \ + ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! 
Prefer over all " \
+                                      "else because this changes most " \
+                                      "operands' meaning") \
+  ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
+  ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
+  ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
+  ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \
+  ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \
+  ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \
+  ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \
+  ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \
+  ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \
+  ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix") \
+  ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix") \
+  ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
+  ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize")
+
+#define ENUM_ENTRY(n, r, d) n,
+typedef enum {
+  INSTRUCTION_CONTEXTS
+  IC_max
+} InstructionContext;
+#undef ENUM_ENTRY
+
+/*
+ * Opcode types, which determine which decode table to use, both in the Intel
+ * manual and also for the decoder.
+ */
+typedef enum {
+  ONEBYTE = 0,
+  TWOBYTE = 1,
+  THREEBYTE_38 = 2,
+  THREEBYTE_3A = 3,
+  THREEBYTE_A6 = 4,
+  THREEBYTE_A7 = 5
+} OpcodeType;
+
+/*
+ * The following structs are used for the hierarchical decode table. After
+ * determining the instruction's class (i.e., which IC_* constant applies to
+ * it), the decoder reads the opcode. Some instructions require specific
+ * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+ *
+ * If a ModR/M byte is not required, "required" is left unset, and the values
+ * for each instructionID are identical.
+ */
+
+typedef uint16_t InstrUID;
+
+/*
+ * ModRMDecisionType - describes the type of ModR/M decision, allowing the
+ * consumer to determine the number of entries in it.
+ *
+ * MODRM_ONEENTRY  - No matter what the value of the ModR/M byte is, the
+ *                   decoded instruction is the same.
+ * MODRM_SPLITRM   - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+ *                   corresponds to one instruction; otherwise, it corresponds
+ *                   to a different instruction.
+ * MODRM_SPLITMISC - If the ModR/M byte is between 0x00 and 0xbf, the ModR/M
+ *                   byte divided by 8 is used to select the instruction;
+ *                   otherwise, each value of the ModR/M byte could correspond
+ *                   to a different instruction.
+ * MODRM_SPLITREG  - The ModR/M byte divided by 8 is used to select the
+ *                   instruction. This corresponds to instructions that use the
+ *                   reg field as an opcode extension.
+ * MODRM_FULL      - Potentially, each value of the ModR/M byte could
+ *                   correspond to a different instruction.
+ */
+
+#define MODRMTYPES \
+  ENUM_ENTRY(MODRM_ONEENTRY) \
+  ENUM_ENTRY(MODRM_SPLITRM) \
+  ENUM_ENTRY(MODRM_SPLITMISC) \
+  ENUM_ENTRY(MODRM_SPLITREG) \
+  ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+  MODRMTYPES
+  MODRM_max
+} ModRMDecisionType;
+#undef ENUM_ENTRY
+
+/*
+ * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
+ * instruction each possible value of the ModR/M byte corresponds to. Once
+ * this information is known, we have narrowed down to a single instruction.
+ */
+struct ModRMDecision {
+  uint8_t modrm_type;
+
+  /* The macro below must be defined wherever this file is included. */
+  INSTRUCTION_IDS
+};
+
+/*
+ * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
+ * given a particular opcode.
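+ *
+ * Put together, a lookup over these tables goes roughly like this (a sketch;
+ * the real decoder helper is not spelled exactly this way):
+ *
+ *   const struct ModRMDecision *dec =
+ *     &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ *   // then index dec according to dec->modrm_type to obtain an InstrUID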
+ */ +struct OpcodeDecision { + struct ModRMDecision modRMDecisions[256]; +}; + +/* + * ContextDecision - Specifies which opcode->instruction tables to look at given + * a particular context (set of attributes). Since there are many possible + * contexts, the decoder first uses CONTEXTS_SYM to determine which context + * applies given a specific set of attributes. Hence there are only IC_max + * entries in this table, rather than 2^(ATTR_max). + */ +struct ContextDecision { + struct OpcodeDecision opcodeDecisions[IC_max]; +}; + +/* + * Physical encodings of instruction operands. + */ + +#define ENCODINGS \ + ENUM_ENTRY(ENCODING_NONE, "") \ + ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ + ENUM_ENTRY(ENCODING_CW, "2-byte") \ + ENUM_ENTRY(ENCODING_CD, "4-byte") \ + ENUM_ENTRY(ENCODING_CP, "6-byte") \ + ENUM_ENTRY(ENCODING_CO, "8-byte") \ + ENUM_ENTRY(ENCODING_CT, "10-byte") \ + ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \ + ENUM_ENTRY(ENCODING_IW, "2-byte") \ + ENUM_ENTRY(ENCODING_ID, "4-byte") \ + ENUM_ENTRY(ENCODING_IO, "8-byte") \ + ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \ + "the opcode byte") \ + ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \ + ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \ + ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \ + ENUM_ENTRY(ENCODING_I, "Position on floating-point stack added to the " \ + "opcode byte") \ + \ + ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \ + ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \ + ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \ + "opcode byte") \ + ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \ + "in type") + +#define ENUM_ENTRY(n, d) n, + typedef enum { + ENCODINGS + ENCODING_max + } OperandEncoding; +#undef ENUM_ENTRY + +/* + * Semantic interpretations of instruction operands. 
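+ *
+ * Each operand pairs one encoding with one type (see struct OperandSpecifier
+ * below); e.g. the 4-byte immediate of an "add eax, imm32" would plausibly be
+ * described as { ENCODING_ID, TYPE_IMM32 }. (An illustrative pairing, not an
+ * excerpt from the generated tables.)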
+ */
+
+#define TYPES \
+  ENUM_ENTRY(TYPE_NONE, "") \
+  ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
+  ENUM_ENTRY(TYPE_REL16, "2-byte") \
+  ENUM_ENTRY(TYPE_REL32, "4-byte") \
+  ENUM_ENTRY(TYPE_REL64, "8-byte") \
+  ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
+  ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
+  ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+  ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
+  ENUM_ENTRY(TYPE_R16, "2-byte") \
+  ENUM_ENTRY(TYPE_R32, "4-byte") \
+  ENUM_ENTRY(TYPE_R64, "8-byte") \
+  ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
+  ENUM_ENTRY(TYPE_IMM16, "2-byte") \
+  ENUM_ENTRY(TYPE_IMM32, "4-byte") \
+  ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+  ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
+  ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
+  ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
+  ENUM_ENTRY(TYPE_RM16, "2-byte") \
+  ENUM_ENTRY(TYPE_RM32, "4-byte") \
+  ENUM_ENTRY(TYPE_RM64, "8-byte") \
+  ENUM_ENTRY(TYPE_M, "Memory operand") \
+  ENUM_ENTRY(TYPE_M8, "1-byte") \
+  ENUM_ENTRY(TYPE_M16, "2-byte") \
+  ENUM_ENTRY(TYPE_M32, "4-byte") \
+  ENUM_ENTRY(TYPE_M64, "8-byte") \
+  ENUM_ENTRY(TYPE_LEA, "Effective address") \
+  ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
+  ENUM_ENTRY(TYPE_M256, "32-byte (AVX)") \
+  ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
+  ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
+  ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
+  ENUM_ENTRY(TYPE_M16_32, "2+4-byte two-part memory operand (LIDT, LGDT)") \
+  ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \
+  ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \
+  ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \
+  ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment base)") \
+  ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
+  ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
+  ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
+  ENUM_ENTRY(TYPE_SREG, "Byte selecting a segment register: 0 = ES, 1 = CS, " \
+                        "2 = SS, 3 = DS, 4 = FS, 5 = GS") \
+  ENUM_ENTRY(TYPE_M32FP, "32-bit IEEE 754 memory floating-point operand") \
+  ENUM_ENTRY(TYPE_M64FP, "64-bit") \
+  ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+  ENUM_ENTRY(TYPE_M16INT, "2-byte memory integer operand for use in " \
+                          "floating-point instructions") \
+  ENUM_ENTRY(TYPE_M32INT, "4-byte") \
+  ENUM_ENTRY(TYPE_M64INT, "8-byte") \
+  ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
+  ENUM_ENTRY(TYPE_MM, "MMX register operand") \
+  ENUM_ENTRY(TYPE_MM32, "4-byte MMX register or memory operand") \
+  ENUM_ENTRY(TYPE_MM64, "8-byte") \
+  ENUM_ENTRY(TYPE_XMM, "XMM register operand") \
+  ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
+  ENUM_ENTRY(TYPE_XMM64, "8-byte") \
+  ENUM_ENTRY(TYPE_XMM128, "16-byte") \
+  ENUM_ENTRY(TYPE_XMM256, "32-byte") \
+  ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
+  ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
+  ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
+  ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
+  \
+  ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
+  ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
+  ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
+  ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
+  ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
+  ENUM_ENTRY(TYPE_DUP1, "operand 1") \
+  ENUM_ENTRY(TYPE_DUP2, "operand 2") \
+  ENUM_ENTRY(TYPE_DUP3, "operand 3") \
+  ENUM_ENTRY(TYPE_DUP4, "operand 4") \
+  ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+typedef enum {
+  TYPES
+  TYPE_max
+} OperandType;
+#undef ENUM_ENTRY
+
+/*
+ * OperandSpecifier - The specification for how to extract and interpret one
+ * operand.
+ */
+struct OperandSpecifier {
+  uint8_t encoding;
+  uint8_t type;
+};
+
+/*
+ * Indicates where the opcode modifier (if any) is to be found. Extended
+ * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
+ */
+
+#define MODIFIER_TYPES \
+  ENUM_ENTRY(MODIFIER_NONE) \
+  ENUM_ENTRY(MODIFIER_OPCODE) \
+  ENUM_ENTRY(MODIFIER_MODRM)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+  MODIFIER_TYPES
+  MODIFIER_max
+} ModifierType;
+#undef ENUM_ENTRY
+
+#define X86_MAX_OPERANDS 5
+
+/*
+ * The specification for how to extract and interpret a full instruction and
+ * its operands.
+ */
+struct InstructionSpecifier {
+  uint8_t modifierType;
+  uint8_t modifierBase;
+
+  /* The macro below must be defined wherever this file is included. */
+  INSTRUCTION_SPECIFIER_FIELDS
+};
+
+/*
+ * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
+ * are supported, and represent real mode, protected (IA-32) mode, and IA-32e
+ * 64-bit mode, respectively.
+ */
+typedef enum {
+  MODE_16BIT,
+  MODE_32BIT,
+  MODE_64BIT
+} DisassemblerMode;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 0000000..e357710
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,218 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <map>
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+                                     unsigned RegNo) const {
+  OS << markup("<reg:")
+     << '%' << getRegisterName(RegNo)
+     << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, OS))
+    printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
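+  // For example (illustrative output), a shuffle such as
+  //   pshufd $27, %xmm0, %xmm1
+  // is annotated with a comment like "xmm1 = xmm0[3,2,1,0]".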
+ if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); +} + +void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0xf; + switch (Imm) { + default: llvm_unreachable("Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + } +} + +void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; + switch (Imm) { + default: llvm_unreachable("Invalid avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value (e.g. for jumps and calls). These +/// print slightly differently than normal immediates. For example, a $ is not +/// emitted. +void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } + else { + // Otherwise, just print the expression. + O << *Op.getExpr(); + } + } +} + +void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + // Print X86 immediates as signed values. 
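+    // E.g. an operand holding -1 prints as $-1 (or as <imm:$-1> when MC
+    // markup is enabled); values outside [-256, 255] additionally get the
+    // "imm = 0x..." hex comment emitted just below.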
+ O << markup("<imm:") + << '$' << formatImm((int64_t)Op.getImm()) + << markup(">"); + + if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) + *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); + + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << markup("<imm:") + << '$' << *Op.getExpr() + << markup(">"); + } +} + +void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op); + const MCOperand &IndexReg = MI->getOperand(Op+2); + const MCOperand &DispSpec = MI->getOperand(Op+3); + const MCOperand &SegReg = MI->getOperand(Op+4); + + O << markup("<mem:"); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+4, O); + O << ':'; + } + + if (DispSpec.isImm()) { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) + O << formatImm(DispVal); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + O << *DispSpec.getExpr(); + } + + if (IndexReg.getReg() || BaseReg.getReg()) { + O << '('; + if (BaseReg.getReg()) + printOperand(MI, Op, O); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op+2, O); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + if (ScaleVal != 1) { + O << ',' + << markup("<imm:") + << ScaleVal // never printed in hex. + << markup(">"); + } + } + O << ')'; + } + + O << markup(">"); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h new file mode 100644 index 0000000..8e09183 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -0,0 +1,87 @@ +//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to AT&T style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_ATT_INST_PRINTER_H +#define X86_ATT_INST_PRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class MCOperand; + +class X86ATTInstPrinter : public MCInstPrinter { +public: + X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; + virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + + // Autogenerated by tblgen. 
+  void printInstruction(const MCInst *MI, raw_ostream &OS);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 0000000..0f6eeb1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,511 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline-terminated strings to the specified stream if desired. This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const char *(*getRegName)(unsigned)) {
+  // If this is a shuffle operation, the switch should fill in this state.
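+  // Mask indices refer to the concatenation of src1 and src2; e.g. for a
+  // 4-element shuffle, a mask of {0, 5, SM_SentinelZero, 3} would mean
+  // dst = { src1[0], src2[1], 0, src1[3] }. (Illustrative values.)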
+ SmallVector<int, 8> ShuffleMask; + const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; + + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); + break; + + case X86::MOVLHPSrr: + case X86::VMOVLHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; + + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; + + case X86::PALIGNR128rr: + case X86::VPALIGNR128rr: + Src1Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PALIGNR128rm: + case X86::VPALIGNR128rm: + Src2Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePALIGNRMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPALIGNR256rr: + Src1Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPALIGNR256rm: + Src2Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePALIGNRMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFDri: + case X86::VPSHUFDri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFDmi: + case X86::VPSHUFDmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFMask(MVT::v4i32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPSHUFDYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFMask(MVT::v8i32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + + case X86::PSHUFHWri: + case X86::VPSHUFHWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFHWmi: + case X86::VPSHUFHWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFHWMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFHWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPSHUFHWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFHWMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PSHUFLWri: + case X86::VPSHUFLWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFLWmi: + case X86::VPSHUFLWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFLWMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFLWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::VPSHUFLWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFLWMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PUNPCKHBWrr: + case X86::VPUNPCKHBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHBWrm: + case X86::VPUNPCKHBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); + break; + case X86::PUNPCKHWDrr: + case X86::VPUNPCKHWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHWDrm: + case X86::VPUNPCKHWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); + break; + case X86::PUNPCKHDQrr: + case X86::VPUNPCKHDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHDQrm: + case X86::VPUNPCKHDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); + break; + case X86::PUNPCKHQDQrr: + case X86::VPUNPCKHQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHQDQrm: + case X86::VPUNPCKHQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); + break; + + case X86::PUNPCKLBWrr: + case X86::VPUNPCKLBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLBWrm: + case X86::VPUNPCKLBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); + break; + case X86::PUNPCKLWDrr: + case X86::VPUNPCKLWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::PUNPCKLWDrm: + case X86::VPUNPCKLWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); + break; + case X86::PUNPCKLDQrr: + case X86::VPUNPCKLDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLDQrm: + case X86::VPUNPCKLDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); + break; + case X86::PUNPCKLQDQrr: + case X86::VPUNPCKLQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLQDQrm: + case X86::VPUNPCKLQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); + break; + + case X86::SHUFPDrri: + case X86::VSHUFPDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::SHUFPDrmi: + case X86::VSHUFPDrmi: + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPDYrmi: + DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::SHUFPSrri: + case X86::VSHUFPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::SHUFPSrmi: + case X86::VSHUFPSrmi: + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPSYrmi: + DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::UNPCKLPDrr: + case X86::VUNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::UNPCKLPDrm: + case X86::VUNPCKLPDrm: + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKLPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDYrm: + DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKLPSrr: + case X86::VUNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKLPSrm: + case X86::VUNPCKLPSrm: + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKLPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSYrm: + DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKHPDrr: + case X86::VUNPCKHPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKHPDrm: + case X86::VUNPCKHPDrm: + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKHPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKHPDYrm: + DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKHPSrr: + case X86::VUNPCKHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKHPSrm: + case X86::VUNPCKHPSrm: + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKHPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKHPSYrm: + DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPSri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSmi: + DecodePSHUFMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPSYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSYmi: + DecodePSHUFMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPDri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDmi: + DecodePSHUFMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::VPERMILPDYmi: + DecodePSHUFMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERM2F128rr: + case X86::VPERM2I128rr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPERM2F128rm: + case X86::VPERM2I128rm: + // For instruction comments purpose, assume the 256-bit vector is v4i64. + DecodeVPERM2X128Mask(MVT::v4i64, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMQYri: + case X86::VPERMPDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMQYmi: + case X86::VPERMPDYmi: + DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + } + + + // If this was a shuffle operation, print the shuffle mask. + if (!ShuffleMask.empty()) { + if (DestName == 0) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; + + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; + } + } + + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && + (int)ShuffleMask[i] >= 0 && + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { + if (!IsFirst) + OS << ','; + else + IsFirst = false; + OS << ShuffleMask[i] % ShuffleMask.size(); + ++i; + } + OS << ']'; + --i; // For loop increments element #. + } + //MI->print(OS, 0); + OS << "\n"; + } + +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h new file mode 100644 index 0000000..13fdf9a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -0,0 +1,25 @@ +//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. 
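+//
+// A typical call site wires this into an instruction printer, e.g. (as the
+// AT&T and Intel printers in this directory do):
+//
+//   if (CommentStream)
+//     EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);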
+// +//===----------------------------------------------------------------------===// + +#ifndef X86_INST_COMMENTS_H +#define X86_INST_COMMENTS_H + +namespace llvm { + class MCInst; + class raw_ostream; + void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)); +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp new file mode 100644 index 0000000..141f4a4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -0,0 +1,208 @@ +//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as Intel-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "X86IntelInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86InstComments.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include <cctype> +using namespace llvm; + +#include "X86GenAsmWriter1.inc" + +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::LOCK) + OS << "\tlock\n"; + + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); + + // If verbose assembly is enabled, we can print some informative comments. 
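+  // E.g. (illustrative) "vunpcklpd xmm2, xmm0, xmm1" is annotated with a
+  // comment like "xmm2 = xmm0[0],xmm1[0]"; this is the same hook the AT&T
+  // printer uses, only the register-name callback differs.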
+ if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); +} + +void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0xf; + switch (Imm) { + default: llvm_unreachable("Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + } +} + +void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; + switch (Imm) { + default: llvm_unreachable("Invalid avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. +void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << Op.getImm(); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } + else { + // Otherwise, just print the expression. 
+ O << *Op.getExpr(); + } + } +} + +static void PrintRegName(raw_ostream &O, StringRef RegName) { + for (unsigned i = 0, e = RegName.size(); i != e; ++i) + O << (char)toupper(RegName[i]); +} + +void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + PrintRegName(O, getRegisterName(Op.getReg())); + } else if (Op.isImm()) { + O << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); + } +} + +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + const MCOperand &IndexReg = MI->getOperand(Op+2); + const MCOperand &DispSpec = MI->getOperand(Op+3); + const MCOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+4, O); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op, O); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+2, O); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + O << *DispSpec.getExpr(); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << DispVal; + } + } + + O << ']'; +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h new file mode 100644 index 0000000..bb769eb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -0,0 +1,96 @@ +//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to Intel style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_INTEL_INST_PRINTER_H +#define X86_INTEL_INST_PRINTER_H + +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCOperand; + +class X86IntelInstPrinter : public MCInstPrinter { +public: + X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; + virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); + + // Autogenerated by tblgen. 
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "OPAQUE PTR "; + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "BYTE PTR "; + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "WORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "DWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "QWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "XMMWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "YMMWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "DWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "QWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "XWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "XMMWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "YMMWORD PTR "; + printMemReference(MI, OpNo, O); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp new file mode 100644 index 0000000..598ddee --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -0,0 +1,476 @@ +//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +// Option to allow disabling arithmetic relaxation to workaround PR9807, which +// is useful when running bitwise comparison experiments on Darwin. 
We should be
+// able to remove this once PR9807 is resolved.
+static cl::opt<bool>
+MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
+         cl::desc("Disable relaxation of arithmetic instructions for X86"));
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default: llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_SecRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_SecRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_global_offset_table:
+  case FK_SecRel_4:
+  case FK_Data_4: return 2;
+  case FK_PCRel_8:
+  case FK_SecRel_8:
+  case FK_Data_8: return 3;
+  }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+                     bool HasRelocationAddend, bool foobar)
+    : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+  StringRef CPU;
+public:
+  X86AsmBackend(const Target &T, StringRef _CPU)
+    : MCAsmBackend(), CPU(_CPU) {}
+
+  unsigned getNumFixupKinds() const {
+    return X86::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+      { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+      { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
+      { "reloc_signed_4byte", 0, 4 * 8, 0},
+      { "reloc_global_offset_table", 0, 4 * 8, 0}
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+    assert(Fixup.getOffset() + Size <= DataSize &&
+           "Invalid fixup offset!");
+
+    // Check that the upper bits are either all zeros or all ones.
+    // Specifically ignore overflow/underflow as long as the leakage is
+    // limited to the lower bits. This is to remain compatible with
+    // other assemblers.
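+    // E.g. for a 4-byte fixup, Size == 4 and isIntN(33, Value) accepts both
+    // 0xFFFFFFFF and -1 (identical in the low 32 bits) but rejects values
+    // that genuinely need more than 32 bits. (Illustrative values.)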
+ assert(isIntN(Size * 8 + 1, Value) && + "Value does not fit in the Fixup field"); + + for (unsigned i = 0; i != Size; ++i) + Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + } + + bool mayNeedRelaxation(const MCInst &Inst) const; + + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const; + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const; + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const; +}; +} // end anonymous namespace + +static unsigned getRelaxedOpcodeBranch(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +static unsigned getRelaxedOpcodeArith(unsigned Op) { + switch (Op) { + default: + return Op; + + // IMUL + case X86::IMUL16rri8: return X86::IMUL16rri; + case X86::IMUL16rmi8: return X86::IMUL16rmi; + case X86::IMUL32rri8: return X86::IMUL32rri; + case X86::IMUL32rmi8: return X86::IMUL32rmi; + case X86::IMUL64rri8: return X86::IMUL64rri32; + case X86::IMUL64rmi8: return X86::IMUL64rmi32; + + // AND + case X86::AND16ri8: return X86::AND16ri; + case X86::AND16mi8: return X86::AND16mi; + case X86::AND32ri8: return X86::AND32ri; + case X86::AND32mi8: return X86::AND32mi; + case X86::AND64ri8: return X86::AND64ri32; + case X86::AND64mi8: return X86::AND64mi32; + + // OR + case X86::OR16ri8: return X86::OR16ri; + case X86::OR16mi8: return X86::OR16mi; + case X86::OR32ri8: return X86::OR32ri; + case X86::OR32mi8: return X86::OR32mi; + case X86::OR64ri8: return X86::OR64ri32; + case X86::OR64mi8: return X86::OR64mi32; + + // XOR + case X86::XOR16ri8: return X86::XOR16ri; + case X86::XOR16mi8: return X86::XOR16mi; + case X86::XOR32ri8: return X86::XOR32ri; + case X86::XOR32mi8: return X86::XOR32mi; + case X86::XOR64ri8: return X86::XOR64ri32; + case X86::XOR64mi8: return X86::XOR64mi32; + + // ADD + case X86::ADD16ri8: return X86::ADD16ri; + case X86::ADD16mi8: return X86::ADD16mi; + case X86::ADD32ri8: return X86::ADD32ri; + case X86::ADD32mi8: return X86::ADD32mi; + case X86::ADD64ri8: return X86::ADD64ri32; + case X86::ADD64mi8: return X86::ADD64mi32; + + // SUB + case X86::SUB16ri8: return X86::SUB16ri; + case X86::SUB16mi8: return X86::SUB16mi; + case X86::SUB32ri8: return X86::SUB32ri; + case X86::SUB32mi8: return X86::SUB32mi; + case X86::SUB64ri8: return X86::SUB64ri32; + case X86::SUB64mi8: return X86::SUB64mi32; + + // CMP + case X86::CMP16ri8: return X86::CMP16ri; + case X86::CMP16mi8: return X86::CMP16mi; + case X86::CMP32ri8: return X86::CMP32ri; + case X86::CMP32mi8: return X86::CMP32mi; + case X86::CMP64ri8: return X86::CMP64ri32; + case X86::CMP64mi8: return X86::CMP64mi32; + + // PUSH + case X86::PUSHi8: return X86::PUSHi32; + case X86::PUSHi16: return X86::PUSHi32; + case X86::PUSH64i8: return X86::PUSH64i32; + case X86::PUSH64i16: return X86::PUSH64i32; + } +} + +static unsigned getRelaxedOpcode(unsigned Op) { + 
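+  // E.g. ADD32ri8 relaxes to ADD32ri (imm8 -> imm32) and JE_1 relaxes to
+  // JE_4 (rel8 -> rel32); opcodes with no relaxed form map to themselves.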
unsigned R = getRelaxedOpcodeArith(Op);
+  if (R != Op)
+    return R;
+  return getRelaxedOpcodeBranch(Op);
+}
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  // Branches can always be relaxed.
+  if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+
+  if (MCDisableArithRelaxation)
+    return false;
+
+  // Check if this instruction is ever relaxable.
+  if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+    return false;
+
+  // Check if it has an expression and is not RIP relative.
+  bool hasExp = false;
+  bool hasRIP = false;
+  for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
+    const MCOperand &Op = Inst.getOperand(i);
+    if (Op.isExpr())
+      hasExp = true;
+
+    if (Op.isReg() && Op.getReg() == X86::RIP)
+      hasRIP = true;
+  }
+
+  // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on
+  // how we do relaxations?
+  return hasExp && !hasRIP;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  // The only relaxation X86 does is from a 1-byte pcrel to a 4-byte pcrel.
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  static const uint8_t Nops[10][10] = {
+    // nop
+    {0x90},
+    // xchg %ax,%ax
+    {0x66, 0x90},
+    // nopl (%[re]ax)
+    {0x0f, 0x1f, 0x00},
+    // nopl 0(%[re]ax)
+    {0x0f, 0x1f, 0x40, 0x00},
+    // nopl 0(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopw 0(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopl 0L(%[re]ax)
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    // nopl 0L(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw 0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw %cs:0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // This CPU doesn't support long nops. If needed, add more.
+  // FIXME: Can we get this from the subtarget somehow?
+  if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
+      CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+    for (uint64_t i = 0; i < Count; ++i)
+      OW->Write8(0x90);
+    return true;
+  }
+
+  // 15 is the longest single nop instruction. Emit as many 15-byte nops as
+  // needed, then emit a nop of the remaining length.
+  do {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+    const uint8_t Prefixes = ThisNopLength <= 10 ?
0 : ThisNopLength - 10; + for (uint8_t i = 0; i < Prefixes; i++) + OW->Write8(0x66); + const uint8_t Rest = ThisNopLength - Prefixes; + for (uint8_t i = 0; i < Rest; i++) + OW->Write8(Nops[Rest - 1][i]); + Count -= ThisNopLength; + } while (Count != 0); + + return true; +} + +/* *** */ + +namespace { +class ELFX86AsmBackend : public X86AsmBackend { +public: + uint8_t OSABI; + ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(_OSABI) { + HasReliableSymbolDifference = true; + } + + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { + const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section); + return ES.getFlags() & ELF::SHF_MERGE; + } +}; + +class ELFX86_32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); + } +}; + +class ELFX86_64AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); + } +}; + +class WindowsX86AsmBackend : public X86AsmBackend { + bool Is64Bit; + +public: + WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU) + : X86AsmBackend(T, CPU) + , Is64Bit(is64Bit) { + } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86WinCOFFObjectWriter(OS, Is64Bit); + } +}; + +class DarwinX86AsmBackend : public X86AsmBackend { +public: + DarwinX86AsmBackend(const Target &T, StringRef CPU) + : X86AsmBackend(T, CPU) { } +}; + +class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { +public: + DarwinX86_32AsmBackend(const Target &T, StringRef CPU) + : DarwinX86AsmBackend(T, CPU) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86MachObjectWriter(OS, /*Is64Bit=*/false, + object::mach::CTM_i386, + object::mach::CSX86_ALL); + } +}; + +class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { +public: + DarwinX86_64AsmBackend(const Target &T, StringRef CPU) + : DarwinX86AsmBackend(T, CPU) { + HasReliableSymbolDifference = true; + } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86MachObjectWriter(OS, /*Is64Bit=*/true, + object::mach::CTM_x86_64, + object::mach::CSX86_ALL); + } + + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { + // Temporary labels in the string literals sections require symbols. The + // issue is that the x86_64 relocation format does not allow symbol + + // offset, and so the linker does not have enough information to resolve the + // access to the appropriate atom unless an external relocation is used. For + // non-cstring sections, we expect the compiler to use a non-temporary label + // for anything that could have an addend pointing outside the symbol. + // + // See <rdar://problem/4765733>. + const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); + return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS; + } + + virtual bool isSectionAtomizable(const MCSection &Section) const { + const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); + // Fixed sized data sections are uniqued, they cannot be diced into atoms. 
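+    // For example, __literal4/__literal8 style sections (S_4BYTE_LITERALS,
+    // S_8BYTE_LITERALS, ...) hold fixed-size, uniqued records, so only the
+    // default section kinds below remain atomizable.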
+ switch (SMO.getType()) { + default: + return true; + + case MCSectionMachO::S_4BYTE_LITERALS: + case MCSectionMachO::S_8BYTE_LITERALS: + case MCSectionMachO::S_16BYTE_LITERALS: + case MCSectionMachO::S_LITERAL_POINTERS: + case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS: + case MCSectionMachO::S_LAZY_SYMBOL_POINTERS: + case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS: + case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS: + case MCSectionMachO::S_INTERPOSING: + return false; + } + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) + return new DarwinX86_32AsmBackend(T, CPU); + + if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) + return new WindowsX86AsmBackend(T, false, CPU); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFX86_32AsmBackend(T, OSABI, CPU); +} + +MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) + return new DarwinX86_64AsmBackend(T, CPU); + + if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) + return new WindowsX86AsmBackend(T, true, CPU); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFX86_64AsmBackend(T, OSABI, CPU); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h new file mode 100644 index 0000000..d8f7278 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -0,0 +1,643 @@ +//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the X86 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef X86BASEINFO_H +#define X86BASEINFO_H + +#include "X86MCTargetDesc.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/MC/MCInstrInfo.h" + +namespace llvm { + +namespace X86 { + // Enums for memory operand decoding. Each memory operand is represented with + // a 5 operand sequence in the form: + // [BaseReg, ScaleAmt, IndexReg, Disp, Segment] + // These enums help decode this. + enum { + AddrBaseReg = 0, + AddrScaleAmt = 1, + AddrIndexReg = 2, + AddrDisp = 3, + + /// AddrSegmentReg - The operand # of the segment in the memory operand. + AddrSegmentReg = 4, + + /// AddrNumOperands - Total number of operands in a memory reference. + AddrNumOperands = 5 + }; +} // end namespace X86; + +/// X86II - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace X86II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // X86 Specific MachineOperand flags. 
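+    // Each flag below describes how a symbol operand's immediate should be
+    // materialized; for example, MO_PIC_BASE_OFFSET yields
+    // SYMBOL_LABEL - PICBASELABEL when lowering a PIC global access.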
+ + MO_NO_FLAG, + + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a + /// relocation of: + /// SYMBOL_LABEL + [. - PICBASELABEL] + MO_GOT_ABSOLUTE_ADDRESS, + + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the + /// immediate should get the value of the symbol minus the PIC base label: + /// SYMBOL_LABEL - PICBASELABEL + MO_PIC_BASE_OFFSET, + + /// MO_GOT - On a symbol operand this indicates that the immediate is the + /// offset to the GOT entry for the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOT + MO_GOT, + + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is + /// the offset to the location of the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTOFF + MO_GOTOFF, + + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is + /// offset to the GOT entry for the symbol name from the current code + /// location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTPCREL + MO_GOTPCREL, + + /// MO_PLT - On a symbol operand this indicates that the immediate is + /// offset to the PLT entry of symbol name from the current code location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @PLT + MO_PLT, + + /// MO_TLSGD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index structure that contains + /// the module number and variable offset for the symbol. Used in the + /// general dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSGD + MO_TLSGD, + + /// MO_TLSLD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// __tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the x86-64 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLD + MO_TLSLD, + + /// MO_TLSLDM - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// ___tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the IA32 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLDM + MO_TLSLDM, + + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the thread-pointer offset for the + /// symbol. Used in the x86-64 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTTPOFF + MO_GOTTPOFF, + + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is + /// the absolute address of the GOT entry with the negative thread-pointer + /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access + /// model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. 
+ /// SYMBOL_LABEL @INDNTPOFF + MO_INDNTPOFF, + + /// MO_TPOFF - On a symbol operand this indicates that the immediate is + /// the thread-pointer offset for the symbol. Used in the x86-64 local + /// exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TPOFF + MO_TPOFF, + + /// MO_DTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS offset of the symbol. Used + /// in the local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @DTPOFF + MO_DTPOFF, + + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is + /// the negative thread-pointer offset for the symbol. Used in the IA32 + /// local exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @NTPOFF + MO_NTPOFF, + + /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the negative thread-pointer offset for + /// the symbol. Used in the PIC IA32 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTNTPOFF + MO_GOTNTPOFF, + + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "__imp_FOO" symbol. This is used for + /// dllimport linkage on windows. + MO_DLLIMPORT, + + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" symbol. This is used for calls + /// and jumps to external functions on Tiger and earlier. + MO_DARWIN_STUB, + + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a + /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY, + + /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates + /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is + /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY_PIC_BASE, + + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this + /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", + /// which is a PIC-base-relative reference to a hidden dyld lazy pointer + /// stub. + MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, + + /// MO_TLVP - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// This is the TLS offset for the Darwin TLS mechanism. + MO_TLVP, + + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate + /// is some TLS offset from the picbase. + /// + /// This is the 32-bit TLS offset for Darwin TLS in PIC mode. + MO_TLVP_PIC_BASE, + + /// MO_SECREL - On a symbol operand this indicates that the immediate is + /// the offset from beginning of section. + /// + /// This is the TLS offset for the COFF/Windows TLS mechanism. + MO_SECREL + }; + + enum { + //===------------------------------------------------------------------===// + // Instruction encodings. These are the standard/most common forms for X86 + // instructions. + // + + // PseudoFrm - This represents an instruction that is a pseudo instruction + // or one that has not been implemented yet. It is illegal to code generate + // it, but tolerated for intermediate implementation stages. 
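+    // For example, 'leave' is RawFrm (a bare opcode), 'push r32' is AddRegFrm
+    // (register folded into the opcode), and a memory-destination 'add' is
+    // MRMDestMem (the Mod/RM byte selects the destination).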
+    Pseudo        = 0,
+
+    /// Raw - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm        = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that
+    /// have their one register operand added to their opcode.
+    AddRegFrm     = 2,
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM
+    /// byte to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg    = 3,
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM
+    /// byte to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem    = 4,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM
+    /// byte to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg     = 5,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM
+    /// byte to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem     = 6,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the intel manual these are represented as /0, /1, ...
+    ///
+
+    // First, instructions that operate on a register r/m operand...
+    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
+    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
+    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
+
+    // MRMInitReg - This form is used for instructions whose source and
+    // destinations are the same register.
+    MRMInitReg = 32,
+
+    //// MRM_XX - A mod/rm byte of exactly 0xXX.
+    MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
+    MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, MRM_CB = 40,
+    MRM_E8 = 41, MRM_F0 = 42, MRM_F8 = 45, MRM_F9 = 46,
+    MRM_D0 = 47, MRM_D1 = 48, MRM_D4 = 49, MRM_D5 = 50,
+    MRM_D6 = 51, MRM_D8 = 52, MRM_D9 = 53, MRM_DA = 54,
+    MRM_DB = 55, MRM_DC = 56, MRM_DD = 57, MRM_DE = 58,
+    MRM_DF = 59,
+
+    /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+    /// immediates, the first of which is a 16-bit immediate (specified by
+    /// the imm encoding) and the second is an 8-bit fixed value.
+    RawFrmImm8 = 43,
+
+    /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+    /// immediates, the first of which is a 16 or 32-bit immediate (specified
+    /// by the imm encoding) and the second is a 16-bit fixed value.  In the
+    /// AMD manual, this operand is described as pntr16:32 and pntr16:16
+    RawFrmImm16 = 44,
+
+    FormMask      = 63,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - Set if this instruction requires an operand size prefix (0x66),
+    // which most often indicates that the instruction operates on 16 bit data
+    // instead of 32 bit data.
+    OpSize      = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction addresses 16 bit
+    // addresses instead of 32 bit addresses (or 32 bit addresses in 64 bit
+    // mode).
+    AdSize      = 1 << 7,
+
+    //===------------------------------------------------------------------===//
+    // Op0Mask - There are several prefix bytes that are used to form two byte
+    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
+    // used to obtain the setting of this field.  If no bits in this field are
+    // set, there is no prefix byte for obtaining a multibyte opcode.
+    //
+    Op0Shift    = 8,
+    Op0Mask     = 0x1F << Op0Shift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB          = 1 << Op0Shift,
+
+    // REP - The 0xF3 prefix byte indicating repetition of the following
+    // instruction.
+    REP         = 2 << Op0Shift,
+
+    // D8-DF - These escape opcodes are used by the floating point unit.  These
+    // values must remain sequential.
+    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
+    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
+    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
+    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,
+
+    // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
+    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
+    A6 = 15 << Op0Shift,  A7 = 16 << Op0Shift,
+
+    // T8XD - Prefix before and after 0x0F.  Combination of T8 and XD.
+    T8XD = 17 << Op0Shift,
+
+    // T8XS - Prefix before and after 0x0F.  Combination of T8 and XS.
+    T8XS = 18 << Op0Shift,
+
+    // TAXD - Prefix before and after 0x0F.  Combination of TA and XD.
+    TAXD = 19 << Op0Shift,
+
+    // XOP8 - Prefix to include use of imm byte.
+    XOP8 = 20 << Op0Shift,
+
+    // XOP9 - Prefix to exclude use of imm byte.
+    XOP9 = 21 << Op0Shift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about REX.W and REX.R bits, and only the former is
+    // statically determined.
+    //
+    REXShift    = Op0Shift + 5,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero
+    // is unused so that we can tell if we forgot to set a value.
+    ImmShift = REXShift + 1,
+    ImmMask    = 7 << ImmShift,
+    Imm8       = 1 << ImmShift,
+    Imm8PCRel  = 2 << ImmShift,
+    Imm16      = 3 << ImmShift,
+    Imm16PCRel = 4 << ImmShift,
+    Imm32      = 5 << ImmShift,
+    Imm32PCRel = 6 << ImmShift,
+    Imm64      = 7 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = ImmShift + 3,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), e.g.
+    // fld0.
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as
+    // fst.
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly reads ST(0) and
+    // writes a result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Lock prefix
+    LOCKShift = FPTypeShift + 3,
+    LOCK = 1 << LOCKShift,
+
+    // Segment override prefixes. Currently we just need ability to address
+    // stuff in gs and fs segments.
+    SegOvrShift = LOCKShift + 1,
+    SegOvrMask  = 3 << SegOvrShift,
+    FS          = 1 << SegOvrShift,
+    GS          = 2 << SegOvrShift,
+
+    // Execution domain for SSE instructions in bits 23, 24.
+    // 0 in bits 23-24 means normal, non-SSE instruction.
+    SSEDomainShift = SegOvrShift + 2,
+
+    OpcodeShift = SSEDomainShift + 2,
+
+    //===------------------------------------------------------------------===//
+    /// VEX - The opcode prefix used by AVX instructions
+    VEXShift = OpcodeShift + 8,
+    VEX         = 1U << 0,
+
+    /// VEX_W - Has opcode-specific functionality, but is used in the same
+    /// way as REX_W is for regular SSE instructions.
+    VEX_W       = 1U << 1,
+
+    /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+    /// address instructions in SSE are represented as 3 address ones in AVX
+    /// and the additional register is encoded in VEX_VVVV prefix.
+    VEX_4V      = 1U << 2,
+
+    /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
+    /// operand 3 with VEX.vvvv.
+    VEX_4VOp3   = 1U << 3,
+
+    /// VEX_I8IMM - Specifies that the last register used in an AVX
+    /// instruction must be encoded in the i8 immediate field. This usually
+    /// happens in instructions with 4 operands.
+    VEX_I8IMM   = 1U << 4,
+
+    /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+    /// instruction uses 256-bit wide registers. This is usually auto detected
+    /// if a VR256 register is used, but some AVX instructions also have this
+    /// field marked when using f256 memory references.
+    VEX_L       = 1U << 5,
+
+    // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+    // prefix. Usually used for scalar instructions. Needed by disassembler.
+    VEX_LIG     = 1U << 6,
+
+    /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+    /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
+    /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+    /// storing a classifier in the imm8 field.  To simplify our
+    /// implementation, we handle this by storing the classifier in the opcode
+    /// field and using this flag to indicate that the encoder should do the
+    /// wacky 3DNow! thing.
+    Has3DNow0F0FOpcode = 1U << 7,
+
+    /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
+    /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
+    MemOp4      = 1U << 8,
+
+    /// XOP - Opcode prefix used by XOP instructions.
+    XOP         = 1U << 9
+
+  };
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified machine instruction.
+  //
+  inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+    return TSFlags >> X86II::OpcodeShift;
+  }
+
+  inline bool hasImm(uint64_t TSFlags) {
+    return (TSFlags & X86II::ImmMask) != 0;
+  }
+
+  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags
+  /// field of the specified instruction.
+  inline unsigned getSizeOfImm(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:  return 1;
+    case X86II::Imm16:
+    case X86II::Imm16PCRel: return 2;
+    case X86II::Imm32:
+    case X86II::Imm32PCRel: return 4;
+    case X86II::Imm64:      return 8;
+    }
+  }
+
+  /// isImmPCRel - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is pc relative.
+  inline unsigned isImmPCRel(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8PCRel:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32PCRel:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm16:
+    case X86II::Imm32:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// getOperandBias - Compute any additional adjustment needed to
+  /// the offset to the start of the memory operand
+  /// in this instruction.
+  /// If this is a two-address instruction, skip one of the register operands.
+  /// FIXME: This should be handled during MCInst lowering.
+  inline int getOperandBias(const MCInstrDesc& Desc)
+  {
+    unsigned NumOps = Desc.getNumOperands();
+    unsigned CurOp = 0;
+    if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+      ++CurOp;
+    else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+      assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+      // Special case for GATHER with 2 TIED_TO operands
+      // Skip the first 2 operands: dst, mask_wb
+      CurOp += 2;
+    }
+    return CurOp;
+  }
+
+  /// getMemoryOperandNo - The function returns the MCInst operand # for the
+  /// first field of the memory operand.  If the instruction doesn't have a
+  /// memory operand, this returns -1.
+  ///
+  /// Note that this ignores tied operands.  If there is a tied register which
+  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+  /// counted as one operand.
+  ///
+  inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+    switch (TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg:
+      // FIXME: Remove this form.
+      return -1;
+    default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+    case X86II::Pseudo:
+    case X86II::RawFrm:
+    case X86II::AddRegFrm:
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+    case X86II::RawFrmImm8:
+    case X86II::RawFrmImm16:
+      return -1;
+    case X86II::MRMDestMem:
+      return 0;
+    case X86II::MRMSrcMem: {
+      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+      bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+      unsigned FirstMemOp = 1;
+      if (HasVEX_4V)
+        ++FirstMemOp; // Skip the register source (which is encoded in
+                      // VEX_VVVV).
+      if (HasMemOp4)
+        ++FirstMemOp; // Skip the register source (which is encoded in I8IMM).
+
+      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
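+      // For example, a VEX_4V form such as 'vaddss xmm1, xmm2, [mem]' encodes
+      // xmm1 in ModRM.reg and xmm2 in VEX.vvvv, so the memory operand
+      // sequence begins at MCInst operand index 2 rather than 1.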
+ //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || + // Opcode == X86::LEA16r || Opcode == X86::LEA32r) + return FirstMemOp; + } + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + return -1; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + unsigned FirstMemOp = 0; + if (HasVEX_4V) + ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). + return FirstMemOp; + } + case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: + case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: + case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_E8: + case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9: + case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: + case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: + case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: + case X86II::MRM_DF: + return -1; + } + } + + /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or + /// higher) register? e.g. r8, xmm8, xmm13, etc. + inline bool isX86_64ExtendedReg(unsigned RegNo) { + switch (RegNo) { + default: break; + case X86::R8: case X86::R9: case X86::R10: case X86::R11: + case X86::R12: case X86::R13: case X86::R14: case X86::R15: + case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D: + case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D: + case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W: + case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: + case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: + case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: + case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: + return true; + } + return false; + } + + inline bool isX86_64NonExtLowByteReg(unsigned reg) { + return (reg == X86::SPL || reg == X86::BPL || + reg == X86::SIL || reg == X86::DIL); + } +} + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp new file mode 100644 index 0000000..de80dd8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -0,0 +1,227 @@ +//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { + class X86ELFObjectWriter : public MCELFObjectTargetWriter { + public: + X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); + + virtual ~X86ELFObjectWriter(); + protected: + virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel, bool IsRelocWithSymbol, + int64_t Addend) const; + }; +} + +X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, + uint16_t EMachine) + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 uses Rel instead of RelA. + /*HasRelocationAddend*/ EMachine != ELF::EM_386) {} + +X86ELFObjectWriter::~X86ELFObjectWriter() +{} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel, + bool IsRelocWithSymbol, + int64_t Addend) const { + // determine the type of the relocation + + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? + MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + unsigned Type; + if (getEMachine() == ELF::EM_X86_64) { + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + + case FK_Data_8: Type = ELF::R_X86_64_PC64; break; + case FK_Data_4: Type = ELF::R_X86_64_PC32; break; + case FK_Data_2: Type = ELF::R_X86_64_PC16; break; + + case FK_PCRel_8: + assert(Modifier == MCSymbolRefExpr::VK_None); + Type = ELF::R_X86_64_PC64; + break; + case X86::reloc_signed_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte: + case FK_PCRel_4: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_PC32; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_X86_64_PLT32; + break; + case MCSymbolRefExpr::VK_GOTPCREL: + Type = ELF::R_X86_64_GOTPCREL; + break; + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_X86_64_GOTTPOFF; + break; + case MCSymbolRefExpr::VK_TLSGD: + Type = ELF::R_X86_64_TLSGD; + break; + case MCSymbolRefExpr::VK_TLSLD: + Type = ELF::R_X86_64_TLSLD; + break; + } + break; + case FK_PCRel_2: + assert(Modifier == MCSymbolRefExpr::VK_None); + Type = ELF::R_X86_64_PC16; + break; + case FK_PCRel_1: + assert(Modifier == MCSymbolRefExpr::VK_None); + Type = ELF::R_X86_64_PC8; + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + case FK_Data_8: Type = ELF::R_X86_64_64; break; + case X86::reloc_signed_4byte: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_32S; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_X86_64_GOT32; + break; + case MCSymbolRefExpr::VK_GOTPCREL: + Type = ELF::R_X86_64_GOTPCREL; + break; + case MCSymbolRefExpr::VK_TPOFF: + Type = ELF::R_X86_64_TPOFF32; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_X86_64_DTPOFF32; + break; + } + break; + case FK_Data_4: + Type = ELF::R_X86_64_32; + break; + case FK_Data_2: Type = ELF::R_X86_64_16; break; + case FK_PCRel_1: + case FK_Data_1: Type = ELF::R_X86_64_8; break; + } + } + } else if (getEMachine() == ELF::EM_386) { + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) 
{ + default: llvm_unreachable("invalid fixup kind!"); + + case X86::reloc_global_offset_table: + Type = ELF::R_386_GOTPC; + break; + + case X86::reloc_signed_4byte: + case FK_PCRel_4: + case FK_Data_4: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_386_PC32; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_386_PLT32; + break; + } + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + + case X86::reloc_global_offset_table: + Type = ELF::R_386_GOTPC; + break; + + // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode + // instead? + case X86::reloc_signed_4byte: + case FK_PCRel_4: + case FK_Data_4: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_386_32; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_386_GOT32; + break; + case MCSymbolRefExpr::VK_GOTOFF: + Type = ELF::R_386_GOTOFF; + break; + case MCSymbolRefExpr::VK_TLSGD: + Type = ELF::R_386_TLS_GD; + break; + case MCSymbolRefExpr::VK_TPOFF: + Type = ELF::R_386_TLS_LE_32; + break; + case MCSymbolRefExpr::VK_INDNTPOFF: + Type = ELF::R_386_TLS_IE; + break; + case MCSymbolRefExpr::VK_NTPOFF: + Type = ELF::R_386_TLS_LE; + break; + case MCSymbolRefExpr::VK_GOTNTPOFF: + Type = ELF::R_386_TLS_GOTIE; + break; + case MCSymbolRefExpr::VK_TLSLDM: + Type = ELF::R_386_TLS_LDM; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_386_TLS_LDO_32; + break; + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_386_TLS_IE_32; + break; + } + break; + case FK_Data_2: Type = ELF::R_386_16; break; + case FK_PCRel_1: + case FK_Data_1: Type = ELF::R_386_8; break; + } + } + } else + llvm_unreachable("Unsupported ELF machine type."); + + return Type; +} + +MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS, + bool IsELF64, + uint8_t OSABI, + uint16_t EMachine) { + MCELFObjectTargetWriter *MOTW = + new X86ELFObjectWriter(IsELF64, OSABI, EMachine); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h new file mode 100644 index 0000000..f2e34cb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -0,0 +1,33 @@ +//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_X86_X86FIXUPKINDS_H +#define LLVM_X86_X86FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace X86 { +enum Fixups { + reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative + reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 + // this will be sign extended at + // runtime. + reloc_global_offset_table, // 32-bit, relative to the start + // of the instruction. Used only + // for _GLOBAL_OFFSET_TABLE_. 
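+  // For example, a RIP-relative 'movq foo(%rip), %rax' (X86::MOV64rm) gets
+  // reloc_riprel_4byte_movq_load rather than plain reloc_riprel_4byte, which
+  // lets the linker eliminate some loads for GOT references (see
+  // X86MCCodeEmitter).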
+ // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp new file mode 100644 index 0000000..7815ae9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -0,0 +1,158 @@ +//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the X86MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +using namespace llvm; + +enum AsmWriterFlavorTy { + // Note: This numbering has to match the GCC assembler dialects for inline + // asm alternatives to work right. + ATT = 0, Intel = 1 +}; + +static cl::opt<AsmWriterFlavorTy> +AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), + cl::desc("Choose style of code to emit from X86 backend:"), + cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), + clEnumValN(Intel, "intel", "Emit Intel-style assembly"), + clEnumValEnd)); + +static cl::opt<bool> +MarkedJTDataRegions("mark-data-regions", cl::init(false), + cl::desc("Mark code section jump table data regions."), + cl::Hidden); + +void X86MCAsmInfoDarwin::anchor() { } + +X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + if (is64Bit) + PointerSize = CalleeSaveStackSlotSize = 8; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + if (!is64Bit) + Data64bitsDirective = 0; // we can't emit a 64-bit unit + + // Use ## as a comment string so that .s files generated by llvm can go + // through the GCC preprocessor without causing an error. This is needed + // because "clang foo.s" runs the C preprocessor, which is usually reserved + // for .S files on other systems. Perhaps this is because the file system + // wasn't always case preserving or something. + CommentString = "##"; + PCSymbol = "."; + + SupportsDebugInformation = true; + DwarfUsesInlineInfoSection = true; + UseDataRegionDirectives = MarkedJTDataRegions; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { +} + +void X86ELFMCAsmInfo::anchor() { } + +X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + bool isX32 = T.getEnvironment() == Triple::GNUX32; + + // For ELF, x86-64 pointer size depends on the ABI. + // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64 + // with the x32 ABI, pointer size remains the default 4. + PointerSize = (is64Bit && !isX32) ? 8 : 4; + + // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI. + CalleeSaveStackSlotSize = is64Bit ? 
8 : 4; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + PrivateGlobalPrefix = ".L"; + WeakRefDirective = "\t.weak\t"; + PCSymbol = "."; + + // Set up DWARF directives + HasLEB128 = true; // Target asm supports leb128 directives (little-endian) + + // Debug Information + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split + // into two .words. + if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) && + T.getArch() == Triple::x86) + Data64bitsDirective = 0; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::Create(4, Context); + return MCBinaryExpr::CreateAdd(Res, Four, Context); +} + +const MCSection *X86ELFMCAsmInfo:: +getNonexecutableStackSection(MCContext &Ctx) const { + return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, + 0, SectionKind::getMetadata()); +} + +void X86MCAsmInfoMicrosoft::anchor() { } + +X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; +} + +void X86MCAsmInfoGNUCOFF::anchor() { } + +X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h new file mode 100644 index 0000000..b6b70fd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -0,0 +1,58 @@ +//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the X86MCAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef X86TARGETASMINFO_H +#define X86TARGETASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" + +namespace llvm { + class Triple; + + class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); + }; + + struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + virtual const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const; + }; + + class X86ELFMCAsmInfo : public MCAsmInfo { + virtual void anchor(); + public: + explicit X86ELFMCAsmInfo(const Triple &Triple); + virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; + }; + + class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + virtual void anchor(); + public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); + }; + + class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + virtual void anchor(); + public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); + }; +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp new file mode 100644 index 0000000..016af71 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -0,0 +1,1239 @@ +//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86MCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mccodeemitter" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class X86MCCodeEmitter : public MCCodeEmitter { + X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; + const MCInstrInfo &MCII; + const MCSubtargetInfo &STI; + MCContext &Ctx; +public: + X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : MCII(mcii), STI(sti), Ctx(ctx) { + } + + ~X86MCCodeEmitter() {} + + bool is64BitMode() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + } + + bool is32BitMode() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & X86::Mode64Bit) == 0; + } + + unsigned GetX86RegNum(const MCOperand &MO) const { + return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7; + } + + // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range + // 0-7 and the difference between the 2 groups is given by the REX prefix. 
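+  // (For instance, plain SSE encodes XMM9 as ModRM value 1 plus the
+  // appropriate REX bit, REX.R or REX.B.)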
+  // In the VEX prefix, registers are seen sequentially from 0-15 and encoded
+  // in 1's complement form, example:
+  //
+  //  ModRM field => XMM9 => 1
+  //  VEX.VVVV    => XMM9 => ~9
+  //
+  // See table 4-35 of Intel AVX Programming Reference for details.
+  unsigned char getVEXRegisterEncoding(const MCInst &MI,
+                                       unsigned OpNum) const {
+    unsigned SrcReg = MI.getOperand(OpNum).getReg();
+    unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    if (X86II::isX86_64ExtendedReg(SrcReg))
+      SrcRegNum |= 8;
+
+    // The registers represented through VEX_VVVV should
+    // be encoded in 1's complement form.
+    return (~SrcRegNum) & 0xf;
+  }
+
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+                     unsigned ImmSize, MCFixupKind FixupKind,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     int ImmOffset = 0) const;
+
+  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                        unsigned RM) {
+    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+    return RM | (RegOpcode << 3) | (Mod << 6);
+  }
+
+  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+                        unsigned &CurByte, raw_ostream &OS) const {
+    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+  }
+
+  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+                   unsigned &CurByte, raw_ostream &OS) const {
+    // SIB byte is in the same format as the ModRMByte.
+    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+  }
+
+  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                        unsigned RegOpcodeField,
+                        uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
+                        SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                           const MCInst &MI, const MCInstrDesc &Desc,
+                           raw_ostream &OS) const;
+
+  void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                 int MemOperand, const MCInst &MI,
+                                 raw_ostream &OS) const;
+
+  void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                        const MCInst &MI, const MCInstrDesc &Desc,
+                        raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new X86MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+  unsigned Size = X86II::getSizeOfImm(TSFlags);
+  bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+  return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+#endif
+
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
+/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only the simple cases
+/// that are known to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+  GOT_None,
+  GOT_Normal,
+  GOT_SymDiff
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+  const MCExpr *RHS = 0;
+  if (Expr->getKind() == MCExpr::Binary) {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+    Expr = BE->getLHS();
+    RHS = BE->getRHS();
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    return GOT_None;
+
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+  const MCSymbol &S = Ref->getSymbol();
+  if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+    return GOT_None;
+  if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+    return GOT_SymDiff;
+  return GOT_Normal;
+}
+
+static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+  if (Expr->getKind() == MCExpr::SymbolRef) {
+    const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+    return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+  }
+  return false;
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+              MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+              SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+  const MCExpr *Expr = NULL;
+  if (DispOp.isImm()) {
+    // If this is a simple integer displacement that doesn't require a
+    // relocation, emit it now.
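+    // (PC-relative kinds always take the fixup path below, even for plain
+    // integer operands, because the final value depends on layout.)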
+ if (FixupKind != FK_PCRel_1 && + FixupKind != FK_PCRel_2 && + FixupKind != FK_PCRel_4) { + EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); + return; + } + Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx); + } else { + Expr = DispOp.getExpr(); + } + + // If we have an immoffset, add it to the expression. + if ((FixupKind == FK_Data_4 || + FixupKind == FK_Data_8 || + FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { + GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + if (Kind != GOT_None) { + assert(ImmOffset == 0); + + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + if (Kind == GOT_Normal) + ImmOffset = CurByte; + } else if (Expr->getKind() == MCExpr::SymbolRef) { + if (HasSecRelSymbolRef(Expr)) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } else if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr); + if (HasSecRelSymbolRef(Bin->getLHS()) + || HasSecRelSymbolRef(Bin->getRHS())) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } + } + + // If the fixup is pc-relative, we need to bias the value to be relative to + // the start of the field, not the end of the field. + if (FixupKind == FK_PCRel_4 || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load)) + ImmOffset -= 4; + if (FixupKind == FK_PCRel_2) + ImmOffset -= 2; + if (FixupKind == FK_PCRel_1) + ImmOffset -= 1; + + if (ImmOffset) + Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx), + Ctx); + + // Emit a symbolic constant as a fixup and 4 zeros. + Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc)); + EmitConstant(0, Size, CurByte, OS); +} + +void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, + raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const{ + const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + unsigned BaseReg = Base.getReg(); + + // Handle %rip relative addressing. + if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode + assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode"); + assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + + unsigned FixupKind = X86::reloc_riprel_4byte; + + // movq loads are handled with a special relocation form which allows the + // linker to eliminate some loads for GOT references which end up in the + // same linkage unit. + if (MI.getOpcode() == X86::MOV64rm) + FixupKind = X86::reloc_riprel_4byte_movq_load; + + // rip-relative addressing is actually relative to the *next* instruction. + // Since an immediate can follow the mod/rm byte for an instruction, this + // means that we need to bias the immediate field of the instruction with + // the size of the immediate field. If we have this case, add it into the + // expression to emit. + int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), + CurByte, OS, Fixups, -ImmSize); + return; + } + + unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; + + // Determine whether a SIB byte is needed. 
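+  // A SIB byte is required when an index register is present, when the base
+  // encodes as R/M == 4 (ESP/RSP/R12), or for the 64-bit absolute [disp32]
+  // form with no base register.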
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can + // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table + // 2-7) and absolute references. + + if (// The SIB byte must be used if there is an index register. + IndexReg.getReg() == 0 && + // The SIB byte must be used if the base is ESP/RSP/R12, all of which + // encode to an R/M value of 4, which indicates that a SIB byte is + // present. + BaseRegNo != N86::ESP && + // If there is no base register and we're in 64-bit mode, we need a SIB + // byte to emit an addr that is just 'disp32' (the non-RIP relative form). + (!is64BitMode() || BaseReg != 0)) { + + if (BaseReg == 0) { // [disp32] in X86-32 mode + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); + return; + } + + // If the base is not EBP/ESP and there is no displacement, use simple + // indirect register encoding, this handles addresses like [EAX]. The + // encoding for [EBP] with no displacement means [disp32] so we handle it + // by emitting a displacement of 0 below. + if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) { + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. + if (Disp.isImm() && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] + EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, + Fixups); + return; + } + + // We need a SIB byte, so start by outputting the ModR/M byte first + assert(IndexReg.getReg() != X86::ESP && + IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + + bool ForceDisp32 = false; + bool ForceDisp8 = false; + if (BaseReg == 0) { + // If there is no base register, we emit the special case SIB byte with + // MOD=0, BASE=5, to JUST get the index, scale, and displacement. + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (!Disp.isImm()) { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (Disp.getImm() == 0 && + // Base reg can't be anything that ends up with '5' as the base + // reg, it is the magic [*] nomenclature that indicates no base. + BaseRegNo != N86::EBP) { + // Emit no displacement ModR/M byte + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + } else if (isDisp8(Disp.getImm())) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + } + + // Calculate what the SS field value should be... + static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + unsigned SS = SSTable[Scale.getImm()]; + + if (BaseReg == 0) { + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. 
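+    // Example: '[2*%rcx + disp32]' emits SIB with SS=1 (scale 2), index=%rcx,
+    // and base=5, the magic "no base register" encoding.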
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ } else {
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8)
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ else if (ForceDisp32 || Disp.getImm() != 0)
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ const MCInstrDesc &Desc,
+ raw_ostream &OS) const {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+ // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+ // XOP: Use XOP prefix byte 0x8f instead of VEX.
+ unsigned char XOP = 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ unsigned char VEX_5M = 0x1;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf;
+
+ // VEX_L (Vector Length):
+ //
+ // 0: scalar or 128-bit vector
+ // 1: 256-bit vector
+ //
+ unsigned char VEX_L = 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ unsigned char VEX_PP = 0;
+
+ // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ VEX_PP = 0x01;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+ VEX_W = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+ XOP = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+ VEX_L = 1;
+
+ switch (TSFlags & X86II::Op0Mask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::T8: // 0F 38
+ VEX_5M = 0x2;
+ break;
+ case X86II::TA: // 0F 3A
+ VEX_5M = 0x3;
+ break;
+ case X86II::T8XS: // F3 0F 38
+ VEX_PP = 0x2;
+ VEX_5M = 0x2;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ VEX_PP = 0x3;
+ VEX_5M = 0x2;
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ VEX_PP = 0x3;
+ VEX_5M = 0x3;
+ break;
+ case X86II::XS: // F3 0F
+ VEX_PP = 0x2;
+ break;
+ case X86II::XD: // F2 0F
+ VEX_PP = 0x3;
+ break;
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::A6: // Bypass: Not used by VEX
+ case X86II::A7: // Bypass: Not used by VEX
+ case X86II::TB: // Bypass: Not used by VEX
+ case 0:
+ break; // No prefix!
+ }
+
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
+ case X86II::MRMDestMem: {
+ // MRMDestMem instruction forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ CurOp = X86::AddrNumOperands;
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ const MCOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ break;
+ }
+ case X86II::MRMSrcMem:
+ // MRMSrcMem instruction forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ if (HasVEX_4VOp3)
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ // CurOp points to start of the MemoryOperand,
+ // it skips TIED_TO operands if they exist, then increments past src1.
+ // CurOp + X86::AddrNumOperands will point to src3.
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ // MRM[0-9]m instruction forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+ case X86II::MRMSrcReg:
+ // MRMSrcReg instruction forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ CurOp++;
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestReg:
+ // MRMDestReg instruction forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // MRM0r-MRM7r instruction forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_B = 0x0;
+ break;
+ default: // RawFrm
+ break;
+ }
+
+ // Emit segment override opcode prefix as needed.
+ EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ EmitByte(0xC5, CurByte, OS);
+ EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ return;
+ }
+
+ // 3 byte VEX prefix
+ EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with an X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, + const MCInstrDesc &Desc) { + unsigned REX = 0; + if (TSFlags & X86II::REX_W) + REX |= 1 << 3; // set REX.W + + if (MI.getNumOperands() == 0) return REX; + + unsigned NumOps = MI.getNumOperands(); + // FIXME: MCInst should explicitize the two-addrness. + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 1 : 0; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; + // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything + // that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + break; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); + case X86II::MRMSrcReg: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 0; // set REX.B + } + break; + case X86II::MRMSrcMem: { + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 1 : 0; + if (NumOps > e && MI.getOperand(e).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + for (; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + default: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 0; // set REX.B + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 2; // set REX.R + } + break; + } + return REX; +} + +/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed +void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags, + unsigned &CurByte, int MemOperand, + const MCInst &MI, + raw_ostream &OS) const { + switch (TSFlags & X86II::SegOvrMask) { + default: llvm_unreachable("Invalid segment!"); + case 0: + // No segment override, check for explicit one on memory operand. + if (MemOperand != -1) { // If the instruction has a memory operand. 
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { + default: llvm_unreachable("Unknown segment register!"); + case 0: break; + case X86::CS: EmitByte(0x2E, CurByte, OS); break; + case X86::SS: EmitByte(0x36, CurByte, OS); break; + case X86::DS: EmitByte(0x3E, CurByte, OS); break; + case X86::ES: EmitByte(0x26, CurByte, OS); break; + case X86::FS: EmitByte(0x64, CurByte, OS); break; + case X86::GS: EmitByte(0x65, CurByte, OS); break; + } + } + break; + case X86II::FS: + EmitByte(0x64, CurByte, OS); + break; + case X86II::GS: + EmitByte(0x65, CurByte, OS); + break; + } +} + +/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode. +/// +/// MemOperand is the operand # of the start of a memory operand if present. If +/// Not present, it is -1. +void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + raw_ostream &OS) const { + + // Emit the lock opcode prefix as needed. + if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + + // Emit segment override opcode prefix as needed. + EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); + + // Emit the repeat opcode prefix as needed. + if ((TSFlags & X86II::Op0Mask) == X86II::REP) + EmitByte(0xF3, CurByte, OS); + + // Emit the address size opcode prefix as needed. + bool need_address_override; + if (TSFlags & X86II::AdSize) { + need_address_override = true; + } else if (MemOperand == -1) { + need_address_override = false; + } else if (is64BitMode()) { + assert(!Is16BitMemOperand(MI, MemOperand)); + need_address_override = Is32BitMemOperand(MI, MemOperand); + } else if (is32BitMode()) { + assert(!Is64BitMemOperand(MI, MemOperand)); + need_address_override = Is16BitMemOperand(MI, MemOperand); + } else { + need_address_override = false; + } + + if (need_address_override) + EmitByte(0x67, CurByte, OS); + + // Emit the operand size opcode prefix as needed. + if (TSFlags & X86II::OpSize) + EmitByte(0x66, CurByte, OS); + + bool Need0FPrefix = false; + switch (TSFlags & X86II::Op0Mask) { + default: llvm_unreachable("Invalid prefix!"); + case 0: break; // No prefix! + case X86II::REP: break; // already handled. + case X86II::TB: // Two-byte opcode prefix + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 + Need0FPrefix = true; + break; + case X86II::T8XS: // F3 0F 38 + EmitByte(0xF3, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::T8XD: // F2 0F 38 + EmitByte(0xF2, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::TAXD: // F2 0F 3A + EmitByte(0xF2, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::XS: // F3 0F + EmitByte(0xF3, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::XD: // F2 0F + EmitByte(0xF2, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::D8: EmitByte(0xD8, CurByte, OS); break; + case X86II::D9: EmitByte(0xD9, CurByte, OS); break; + case X86II::DA: EmitByte(0xDA, CurByte, OS); break; + case X86II::DB: EmitByte(0xDB, CurByte, OS); break; + case X86II::DC: EmitByte(0xDC, CurByte, OS); break; + case X86II::DD: EmitByte(0xDD, CurByte, OS); break; + case X86II::DE: EmitByte(0xDE, CurByte, OS); break; + case X86II::DF: EmitByte(0xDF, CurByte, OS); break; + } + + // Handle REX prefix. + // FIXME: Can this come before F2 etc to simplify emission? 
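+ // For reference (illustrative, not from the original source): a REX
+ // prefix is the single byte 0100WRXB, i.e.
+ //
+ //   REX = 0x40 | (W << 3) | (R << 2) | (X << 1) | B;
+ //
+ // matching the bit positions DetermineREXPrefix sets above. For example,
+ // "movq %rax, %r8" needs REX.W (64-bit operand size) and REX.B (extended
+ // r/m register), so it encodes as 49 89 C0.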
+ if (is64BitMode()) {
+ if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+ EmitByte(0x40 | REX, CurByte, OS);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ EmitByte(0x0F, CurByte, OS);
+
+ // FIXME: Pull this up into previous switch if REX can be moved earlier.
+ switch (TSFlags & X86II::Op0Mask) {
+ case X86II::T8XS: // F3 0F 38
+ case X86II::T8XD: // F2 0F 38
+ case X86II::T8: // 0F 38
+ EmitByte(0x38, CurByte, OS);
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::TA: // 0F 3A
+ EmitByte(0x3A, CurByte, OS);
+ break;
+ case X86II::A6: // 0F A6
+ EmitByte(0xA6, CurByte, OS);
+ break;
+ case X86II::A7: // 0F A7
+ EmitByte(0xA7, CurByte, OS);
+ break;
+ }
+}
+
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Pseudo instructions don't get encoded.
+ if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return;
+
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // Keep track of the current byte being emitted.
+ unsigned CurByte = 0;
+
+ // Is this instruction encoded using the AVX VEX prefix?
+ bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+
+ // Does it use the VEX.VVVV field?
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+ if (!HasVEXPrefix)
+ EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+ else
+ EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+ unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+ BaseOpcode = 0x0F; // Weird 3DNow! encoding.
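+ // Background note (illustrative, not from the original source): 3DNow!
+ // instructions are encoded as 0F 0F <modrm> <imm8>, where the trailing
+ // imm8 selects the operation (e.g. "pfadd %mm1, %mm0" is 0F 0F C1 9E).
+ // That is why the base opcode becomes 0x0F here and the real opcode byte
+ // from getBaseOpcodeFor(TSFlags) is emitted last, near the end of this
+ // function.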
+ + unsigned SrcRegNum = 0; + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + llvm_unreachable("FIXME: Remove this form when the JIT moves to MCCodeEmitter!"); + default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n"; + llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!"); + case X86II::Pseudo: + llvm_unreachable("Pseudo instruction shouldn't be emitted"); + case X86II::RawFrm: + EmitByte(BaseOpcode, CurByte, OS); + break; + case X86II::RawFrmImm8: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + OS, Fixups); + break; + case X86II::RawFrmImm16: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + OS, Fixups); + break; + + case X86II::AddRegFrm: + EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + break; + + case X86II::MRMDestReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + EmitRegModRMByte(MI.getOperand(CurOp), + GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMDestMem: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + X86::AddrNumOperands; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + EmitMemModRMByte(MI, CurOp, + GetX86RegNum(MI.getOperand(SrcRegNum)), + TSFlags, CurByte, OS, Fixups); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMSrcReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + + // 2 operands skipped with HasMemOp4, compensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; + break; + + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; + + EmitByte(BaseOpcode, CurByte, OS); + + EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + TSFlags, CurByte, OS, Fixups); + CurOp += AddrOperands + 1; + if (HasVEX_4VOp3) + ++CurOp; + break; + } + + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + EmitByte(BaseOpcode, CurByte, OS); + EmitRegModRMByte(MI.getOperand(CurOp++), + (TSFlags & X86II::FormMask)-X86II::MRM0r, + CurByte, OS); + break; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). 
+ ++CurOp;
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
+ TSFlags, CurByte, OS, Fixups);
+ CurOp += X86::AddrNumOperands;
+ break;
+ case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+ case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_D0:
+ case X86II::MRM_D1: case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D6: case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
+ case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
+ case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8:
+ case X86II::MRM_F9:
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ unsigned char MRM;
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Invalid Form");
+ case X86II::MRM_C1: MRM = 0xC1; break;
+ case X86II::MRM_C2: MRM = 0xC2; break;
+ case X86II::MRM_C3: MRM = 0xC3; break;
+ case X86II::MRM_C4: MRM = 0xC4; break;
+ case X86II::MRM_C8: MRM = 0xC8; break;
+ case X86II::MRM_C9: MRM = 0xC9; break;
+ case X86II::MRM_CA: MRM = 0xCA; break;
+ case X86II::MRM_CB: MRM = 0xCB; break;
+ case X86II::MRM_D0: MRM = 0xD0; break;
+ case X86II::MRM_D1: MRM = 0xD1; break;
+ case X86II::MRM_D4: MRM = 0xD4; break;
+ case X86II::MRM_D5: MRM = 0xD5; break;
+ case X86II::MRM_D6: MRM = 0xD6; break;
+ case X86II::MRM_D8: MRM = 0xD8; break;
+ case X86II::MRM_D9: MRM = 0xD9; break;
+ case X86II::MRM_DA: MRM = 0xDA; break;
+ case X86II::MRM_DB: MRM = 0xDB; break;
+ case X86II::MRM_DC: MRM = 0xDC; break;
+ case X86II::MRM_DD: MRM = 0xDD; break;
+ case X86II::MRM_DE: MRM = 0xDE; break;
+ case X86II::MRM_DF: MRM = 0xDF; break;
+ case X86II::MRM_E8: MRM = 0xE8; break;
+ case X86II::MRM_F0: MRM = 0xF0; break;
+ case X86II::MRM_F8: MRM = 0xF8; break;
+ case X86II::MRM_F9: MRM = 0xF9; break;
+ }
+ EmitByte(MRM, CurByte, OS);
+ break;
+ }
+
+ // If there is a remaining operand, it must be a trailing immediate. Emit it
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ // The last source register of a 4-operand instruction in AVX is encoded
+ // in bits[7:4] of an immediate byte.
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+ const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = GetX86RegNum(MO) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
+ // If there is an additional 5th operand, it must be an immediate, which
+ // is encoded in bits[3:0].
+ if (CurOp != NumOps) {
+ const MCOperand &MIMM = MI.getOperand(CurOp++);
+ if (MIMM.isImm()) {
+ unsigned Val = MIMM.getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ RegNum |= Val;
+ }
+ }
+ EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1,
+ CurByte, OS, Fixups);
+ } else {
+ unsigned FixupKind;
+ // FIXME: Is there a better way to know that we need a signed relocation?
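+ // Illustrative background for the FIXME above (not from the original
+ // source): these opcodes take an imm32 that the CPU sign-extends to 64
+ // bits; MOV64ri32, for instance, encodes "movq $imm32, %rax" as
+ // 48 C7 C0 <imm32>. The relocated value must therefore fit in a *signed*
+ // 32-bit field; on ELF this is the difference between R_X86_64_32S and
+ // R_X86_64_32.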
+ if (MI.getOpcode() == X86::ADD64ri32 ||
+ MI.getOpcode() == X86::MOV64ri32 ||
+ MI.getOpcode() == X86::MOV64mi32 ||
+ MI.getOpcode() == X86::PUSH64i32)
+ FixupKind = X86::reloc_signed_4byte;
+ else
+ FixupKind = getImmFixupKind(TSFlags);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
+ CurByte, OS, Fixups);
+ }
+ }
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+ EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+ // FIXME: Verify.
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+ errs() << "Cannot encode all operands of: ";
+ MI.dump();
+ errs() << '\n';
+ abort();
+ }
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 0000000..5e84530
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,442 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+
+std::string X86_MC::ParseX86Triple(StringRef TT) {
+ Triple TheTriple(TT);
+ std::string FS;
+ if (TheTriple.getArch() == Triple::x86_64)
+ FS = "+64bit-mode";
+ else
+ FS = "-64bit-mode";
+ return FS;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments. If we can't run cpuid on the host, return true.
+bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ #if defined(__GNUC__)
+ // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx, %%rsi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value)); + return false; + #elif defined(_MSC_VER) + int registers[4]; + __cpuid(registers, value); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; + #else + return true; + #endif +#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) + #if defined(__GNUC__) + asm ("movl\t%%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl\t%%ebx, %%esi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value)); + return false; + #elif defined(_MSC_VER) + __asm { + mov eax,value + cpuid + mov esi,rEAX + mov dword ptr [esi],eax + mov esi,rEBX + mov dword ptr [esi],ebx + mov esi,rECX + mov dword ptr [esi],ecx + mov esi,rEDX + mov dword ptr [esi],edx + } + return false; + #else + return true; + #endif +#else + return true; +#endif +} + +/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the +/// 4 values in the specified arguments. If we can't run cpuid on the host, +/// return true. +bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { +#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) + #if defined(__GNUC__) + // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually. + asm ("movq\t%%rbx, %%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx, %%rsi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value), + "c" (subleaf)); + return false; + #elif defined(_MSC_VER) + // __cpuidex was added in MSVC++ 9.0 SP1 + #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729) + int registers[4]; + __cpuidex(registers, value, subleaf); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; + #else + return true; + #endif + #else + return true; + #endif +#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) + #if defined(__GNUC__) + asm ("movl\t%%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl\t%%ebx, %%esi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value), + "c" (subleaf)); + return false; + #elif defined(_MSC_VER) + __asm { + mov eax,value + mov ecx,subleaf + cpuid + mov esi,rEAX + mov dword ptr [esi],eax + mov esi,rEBX + mov dword ptr [esi],ebx + mov esi,rECX + mov dword ptr [esi],ecx + mov esi,rEDX + mov dword ptr [esi],edx + } + return false; + #else + return true; + #endif +#else + return true; +#endif +} + +void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, + unsigned &Model) { + Family = (EAX >> 8) & 0xf; // Bits 8 - 11 + Model = (EAX >> 4) & 0xf; // Bits 4 - 7 + if (Family == 6 || Family == 0xf) { + if (Family == 0xf) + // Examine extended family ID if family ID is F. + Family += (EAX >> 20) & 0xff; // Bits 20 - 27 + // Examine extended model ID if family ID is 6 or F. + Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 + } +} + +unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) { + Triple TheTriple(TT); + if (TheTriple.getArch() == Triple::x86_64) + return DWARFFlavour::X86_64; + + if (TheTriple.isOSDarwin()) + return isEH ? 
DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+ if (TheTriple.getOS() == Triple::MinGW32 ||
+ TheTriple.getOS() == Triple::Cygwin)
+ // Unsupported for now; just a quick fallback.
+ return DWARFFlavour::X86_32_Generic;
+ return DWARFFlavour::X86_32_Generic;
+}
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+ // FIXME: TableGen these.
+ for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+ unsigned SEH = MRI->getEncodingValue(Reg);
+ MRI->mapLLVMRegToSEHReg(Reg, SEH);
+ }
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+ StringRef FS) {
+ std::string ArchFS = X86_MC::ParseX86Triple(TT);
+ if (!FS.empty()) {
+ if (!ArchFS.empty())
+ ArchFS = ArchFS + "," + FS.str();
+ else
+ ArchFS = FS;
+ }
+
+ std::string CPUName = CPU;
+ if (CPUName.empty()) {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ CPUName = sys::getHostCPUName();
+#else
+ CPUName = "generic";
+#endif
+ }
+
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+ InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
+ return X;
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitX86MCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
+ Triple TheTriple(TT);
+ unsigned RA = (TheTriple.getArch() == Triple::x86_64)
+ ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
+
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitX86MCRegisterInfo(X, RA,
+ X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true),
+ RA);
+ X86_MC::InitLLVM2SEHRegisterMapping(X);
+ return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
+ Triple TheTriple(TT);
+ bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+ if (is64Bit)
+ MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+ else
+ MAI = new X86MCAsmInfoDarwin(TheTriple);
+ } else if (TheTriple.getEnvironment() == Triple::ELF) {
+ // Force the use of an ELF container.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ } else if (TheTriple.getOS() == Triple::Win32) {
+ MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+ } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) {
+ MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+ } else {
+ // The default is ELF.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ }
+
+ // Initialize initial frame state.
+ // Calculate the number of bytes used to store the return address.
+ int stackGrowth = is64Bit ? -8 : -4;
+
+ // Initial state of the frame pointer is esp+stackGrowth.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ // Add the return address to the move list.
+ MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+ MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
+ MAI->addInitialFrameState(0, CSDst, CSSrc);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+
+ Triple T(TT);
+ bool is64Bit = T.getArch() == Triple::x86_64;
+
+ if (RM == Reloc::Default) {
+ // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+ // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we + // use static relocation model by default. + if (T.isOSDarwin()) { + if (is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::DynamicNoPIC; + } else if (T.isOSWindows() && is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::Static; + } + + // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC + // is defined as a model for code which may be used in static or dynamic + // executables but not necessarily a shared library. On X86-32 we just + // compile in -static mode, in x86-64 we use PIC. + if (RM == Reloc::DynamicNoPIC) { + if (is64Bit) + RM = Reloc::PIC_; + else if (!T.isOSDarwin()) + RM = Reloc::Static; + } + + // If we are on Darwin, disallow static relocation model in X86-64 mode, since + // the Mach-O file format doesn't support it. + if (RM == Reloc::Static && T.isOSDarwin() && is64Bit) + RM = Reloc::PIC_; + + // For static codegen, if we're not already set, use Small codegen. + if (CM == CodeModel::Default) + CM = CodeModel::Small; + else if (CM == CodeModel::JITDefault) + // 64-bit JIT places everything in the same buffer except external funcs. + CM = is64Bit ? CodeModel::Large : CodeModel::Small; + + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &_OS, + MCCodeEmitter *_Emitter, + bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) + return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); + + if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) + return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll); + + return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); +} + +static MCInstPrinter *createX86MCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return new X86ATTInstPrinter(MAI, MII, MRI); + if (SyntaxVariant == 1) + return new X86IntelInstPrinter(MAI, MII, MRI); + return 0; +} + +static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { + return new MCInstrAnalysis(Info); +} + +// Force static initialization. +extern "C" void LLVMInitializeX86TargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo); + RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo); + RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target, + X86_MC::createX86MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. 
+ TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target, + createX86MCInstrAnalysis); + TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target, + createX86MCInstrAnalysis); + + // Register the code emitter. + TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target, + createX86MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target, + createX86MCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, + createX86_32AsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, + createX86_64AsmBackend); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target, + createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target, + createMCStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(TheX86_32Target, + createX86MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, + createX86MCInstPrinter); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h new file mode 100644 index 0000000..981aa1a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -0,0 +1,114 @@ +//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef X86MCTARGETDESC_H +#define X86MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include <string> + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class StringRef; +class raw_ostream; + +extern Target TheX86_32Target, TheX86_64Target; + +/// DWARFFlavour - Flavour of dwarf regnumbers +/// +namespace DWARFFlavour { + enum { + X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2 + }; +} + +/// N86 namespace - Native X86 register numbers +/// +namespace N86 { + enum { + EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7 + }; +} + +namespace X86_MC { + std::string ParseX86Triple(StringRef TT); + + /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in + /// the specified arguments. If we can't run cpuid on the host, return true. + bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX); + /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return + /// the 4 values in the specified arguments. If we can't run cpuid on the + /// host, return true. + bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX); + + void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); + + unsigned getDwarfRegFlavour(StringRef TT, bool isEH); + + void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); + + /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance. + /// This is exposed so Asm parser, etc. do not need to go through + /// TargetRegistry. 
+ MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS); +} + +MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU); +MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU); + +/// createX86MachObjectWriter - Construct an X86 Mach-O object writer. +MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + +/// createX86ELFObjectWriter - Construct an X86 ELF object writer. +MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS, + bool IsELF64, + uint8_t OSABI, + uint16_t EMachine); +/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer. +MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); +} // End llvm namespace + + +// Defines symbolic names for X86 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" + +// Defines symbolic names for the X86 instructions. +// +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "X86GenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp new file mode 100644 index 0000000..64f005c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -0,0 +1,586 @@ +//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" + +using namespace llvm; +using namespace llvm::object; + +namespace { +class X86MachObjectWriter : public MCMachObjectTargetWriter { + bool RecordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue); + void RecordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + + void RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + void RecordX86_64Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); +public: + X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/Is64Bit) {} + + void RecordRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue) { + if (Writer->is64Bit()) + RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + else + RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } +}; +} + +static bool isFixupKindRIPRel(unsigned Kind) { + return Kind == X86::reloc_riprel_4byte || + Kind == X86::reloc_riprel_4byte_movq_load; +} + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: return 0; + case FK_PCRel_2: + case FK_Data_2: return 1; + case FK_PCRel_4: + // FIXME: Remove these!!! + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case FK_Data_4: return 2; + case FK_Data_8: return 3; + } +} + +void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // See <reloc.h>. 
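+ // For reference, the <reloc.h> layout that the Word0/Word1 packing in
+ // this file reproduces by hand (a sketch of the Mach-O declaration):
+ //
+ //   struct relocation_info {
+ //     int32_t  r_address;         // Word0: offset within the section
+ //     uint32_t r_symbolnum : 24,  // Word1: symbol index / section ordinal
+ //              r_pcrel     : 1,
+ //              r_length    : 2,   // log2 of the fixup size
+ //              r_extern    : 1,
+ //              r_type      : 4;   // e.g. RIT_X86_64_Branch
+ //   };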
+ uint32_t FixupOffset =
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+
+ Value = Target.getConstant();
+
+ if (IsPCRel) {
+ // Compensate for the relocation offset; Darwin x86_64 relocations only have
+ // the addend and appear to have attempted to define it to be the actual
+ // expression addend without the PCrel bias. However, instructions with data
+ // following the relocation are not accommodated (see the comment below
+ // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+ Value += 1LL << Log2Size;
+ }
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ Type = macho::RIT_X86_64_Unsigned;
+ Index = 0;
+
+ // FIXME: I believe this is broken; I don't think the linker can understand
+ // it. I think it would require a local relocation, but I'm not sure if that
+ // would work either. The official way to get an absolute PCrel relocation
+ // is to use an absolute symbol (which we don't support yet).
+ if (IsPCRel) {
+ IsExtern = 1;
+ Type = macho::RIT_X86_64_Branch;
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ MCSymbolData &A_SD = Asm.getSymbolData(*A);
+ const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ MCSymbolData &B_SD = Asm.getSymbolData(*B);
+ const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+ // Neither symbol can be modified.
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+ report_fatal_error("unsupported relocation of modified symbol");
+
+ // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+ // implement most of these correctly.
+ if (IsPCRel)
+ report_fatal_error("unsupported pc-relative relocation of difference");
+
+ // The support for the situation where one or both of the symbols would
+ // require a local relocation is handled just like if the symbols were
+ // external. This is certainly used in the case of debug sections where the
+ // section has only temporary symbols and thus the symbols don't have base
+ // symbols. This is encoded using the section ordinal and non-extern
+ // relocation entries.
+
+ // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+ // single SIGNED relocation); reject it for now, except for the case where
+ // both symbols have no base (their bases are equal because both are NULL).
+ if (A_Base == B_Base && A_Base)
+ report_fatal_error("unsupported relocation with identical base");
+
+ Value += Writer->getSymbolAddress(&A_SD, Layout) -
+ (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+ Value -= Writer->getSymbolAddress(&B_SD, Layout) -
+ (B_Base == NULL ?
0 : Writer->getSymbolAddress(B_Base, Layout)); + + if (A_Base) { + Index = A_Base->getIndex(); + IsExtern = 1; + } + else { + Index = A_SD.getFragment()->getParent()->getOrdinal() + 1; + IsExtern = 0; + } + Type = macho::RIT_X86_64_Unsigned; + + macho::RelocationEntry MRE; + MRE.Word0 = FixupOffset; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + if (B_Base) { + Index = B_Base->getIndex(); + IsExtern = 1; + } + else { + Index = B_SD.getFragment()->getParent()->getOrdinal() + 1; + IsExtern = 0; + } + Type = macho::RIT_X86_64_Subtractor; + } else { + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand x86_64 relocation entries, and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + const MCSectionMachO &Section = static_cast<const MCSectionMachO&>( + Fragment->getParent()->getSection()); + if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG)) + Base = 0; + } + + // x86_64 almost always uses external relocations, except when there is no + // symbol to use as a base address (a local symbol with no preceding + // non-local symbol). + if (Base) { + Index = Base->getIndex(); + IsExtern = 1; + + // Add the local offset, if needed. + if (Base != &SD) + Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + } else if (Symbol->isInSection() && !Symbol->isVariable()) { + // The index is the section ordinal (1-based). + Index = SD.getFragment()->getParent()->getOrdinal() + 1; + IsExtern = 0; + Value += Writer->getSymbolAddress(&SD, Layout); + + if (IsPCRel) + Value -= FixupAddress + (1 << Log2Size); + } else if (Symbol->isVariable()) { + const MCExpr *Value = Symbol->getVariableValue(); + int64_t Res; + bool isAbs = Value->EvaluateAsAbsolute(Res, Layout, + Writer->getSectionAddressMap()); + if (isAbs) { + FixedValue = Res; + return; + } else { + report_fatal_error("unsupported relocation of variable '" + + Symbol->getName() + "'"); + } + } else { + report_fatal_error("unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + } + + MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); + if (IsPCRel) { + if (IsRIPRel) { + if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // x86_64 distinguishes movq foo@GOTPCREL so that the linker can + // rewrite the movq to an leaq at link time if the symbol ends up in + // the same linkage unit. + if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + Type = macho::RIT_X86_64_GOTLoad; + else + Type = macho::RIT_X86_64_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + Type = macho::RIT_X86_64_TLV; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + report_fatal_error("unsupported symbol modifier in relocation"); + } else { + Type = macho::RIT_X86_64_Signed; + + // The Darwin x86_64 relocation format has a problem where it cannot + // encode an address (L<foo> + <constant>) which is outside the atom + // containing L<foo>. Generally, this shouldn't occur but it does + // happen when we have a RIPrel instruction with data following the + // relocation entry (e.g., movb $012, L0(%rip)). 
Even with the PCrel
+ // adjustment Darwin x86_64 uses, the offset is still negative and the
+ // linker has no way to recognize this.
+ //
+ // To work around this, Darwin uses several special relocation types
+ // to indicate the offsets. However, the specification or
+ // implementation of these seems to also be incomplete; they should
+ // adjust the addend as well based on the actual encoded instruction
+ // (the additional bias), but instead appear to just look at the final
+ // offset.
+ switch (-(Target.getConstant() + (1LL << Log2Size))) {
+ case 1: Type = macho::RIT_X86_64_Signed1; break;
+ case 2: Type = macho::RIT_X86_64_Signed2; break;
+ case 4: Type = macho::RIT_X86_64_Signed4; break;
+ }
+ } else {
+ if (Modifier != MCSymbolRefExpr::VK_None)
+ report_fatal_error("unsupported symbol modifier in branch "
+ "relocation");
+
+ Type = macho::RIT_X86_64_Branch;
+ }
+ } else {
+ if (Modifier == MCSymbolRefExpr::VK_GOT) {
+ Type = macho::RIT_X86_64_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+ // case all we do is set the PCrel bit in the relocation entry; this is
+ // used with exception handling, for example. The source is required to
+ // include any necessary offset directly.
+ Type = macho::RIT_X86_64_GOT;
+ IsPCRel = 1;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ report_fatal_error("TLVP symbol modifier should have been rip-rel");
+ } else if (Modifier != MCSymbolRefExpr::VK_None)
+ report_fatal_error("unsupported symbol modifier in relocation");
+ else
+ Type = macho::RIT_X86_64_Unsigned;
+ }
+ }
+
+ // x86_64 always writes custom values into the fixups.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ macho::RelocationEntry MRE;
+ MRE.Word0 = FixupOffset;
+ MRE.Word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue) {
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Type = macho::RIT_Vanilla;
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+ if (!A_SD->getFragment())
+ report_fatal_error("symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+
+ uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+ if (!B_SD->getFragment())
+ report_fatal_error("symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+
+ // Select the appropriate difference relocation type.
+ //
+ // Note that there is no longer any semantic difference between these two
+ // relocation types from the linker's point of view; this is done solely for
+ // pedantic compatibility with 'as'.
+ Type = A_SD->isExternal() ?
(unsigned)macho::RIT_Difference : + (unsigned)macho::RIT_Generic_LocalDifference; + Value2 = Writer->getSymbolAddress(B_SD, Layout); + FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == macho::RIT_Difference || + Type == macho::RIT_Generic_LocalDifference) { + // If the offset is too large to fit in a scattered relocation, + // we're hosed. It's an unfortunate limitation of the MachO format. + if (FixupOffset > 0xffffff) { + char Buffer[32]; + format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); + Asm.getContext().FatalError(Fixup.getLoc(), + Twine("Section too large, can't encode " + "r_address (") + Buffer + + ") into 24 bits of scattered " + "relocation entry."); + llvm_unreachable("fatal error returned?!"); + } + + macho::RelocationEntry MRE; + MRE.Word0 = ((0 << 0) | + (macho::RIT_Pair << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value2; + Writer->addRelocation(Fragment->getParent(), MRE); + } else { + // If the offset is more than 24-bits, it won't fit in a scattered + // relocation offset field, so we fall back to using a non-scattered + // relocation. This is a bit risky, as if the offset reaches out of + // the block and the linker is doing scattered loading on this + // symbol, things can go badly. + // + // Required for 'as' compatibility. + if (FixupOffset > 0xffffff) + return false; + } + + macho::RelocationEntry MRE; + MRE.Word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value; + Writer->addRelocation(Fragment->getParent(), MRE); + return true; +} + +void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && + !is64Bit() && + "Should only be called with a 32-bit TLVP relocation!"); + + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = 0; + + // Get the symbol data. + MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + unsigned Index = SD_A->getIndex(); + + // We're only going to have a second symbol in pic mode and it'll be a + // subtraction from the picbase. For 32-bit pic the addend is the difference + // between the picbase and the next address. For 32-bit static the addend is + // zero. + if (Target.getSymB()) { + // If this is a subtraction then we're pcrel. 
+ uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol()); + IsPCRel = 1; + FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) + + Target.getConstant()); + FixedValue += 1ULL << Log2Size; + } else { + FixedValue = 0; + } + + // struct relocation_info (8 bytes) + macho::RelocationEntry MRE; + MRE.Word0 = Value; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (1 << 27) | // Extern + (macho::RIT_Generic_TLV << 28)); // Type + Writer->addRelocation(Fragment->getParent(), MRE); +} + +void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // If this is a 32-bit TLVP reloc it's handled a bit differently. + if (Target.getSymA() && + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) { + RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + return; + } + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. + if (Target.getSymB()) { + RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + return; + } + + // Get the symbol data, if any. + MCSymbolData *SD = 0; + if (Target.getSymA()) + SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + + // If this is an internal relocation with an offset, it also needs a scattered + // relocation entry. + uint32_t Offset = Target.getConstant(); + if (IsPCRel) + Offset += 1 << Log2Size; + // Try to record the scattered relocation if needed. Fall back to non + // scattered if necessary (see comments in RecordScatteredRelocation() + // for details). + if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) && + RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue)) + return; + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + // + // FIXME: Currently, these are never generated (see code below). I cannot + // find a case where they are actually emitted. + Type = macho::RIT_Vanilla; + } else { + // Resolve constant variables. + if (SD->getSymbol().isVariable()) { + int64_t Res; + if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(SD)) { + IsExtern = 1; + Index = SD->getIndex(); + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!SD->Symbol->isUndefined()) + FixedValue -= Layout.getSymbolOffset(SD); + } else { + // The index is the section ordinal (1-based). 
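The Word1 packing repeated throughout this file for plain (non-scattered) entries follows the relocation_info bit layout from <reloc.h>. A minimal sketch of that layout, written here for illustration only:

#include <cstdint>

// r_symbolnum:24 | r_pcrel:1 | r_length:2 | r_extern:1 | r_type:4
uint32_t packRelocWord1(uint32_t Index, bool IsPCRel, unsigned Log2Size,
                        bool IsExtern, unsigned Type) {
  return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
         (uint32_t(IsExtern) << 27) | (Type << 28);
}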
+ const MCSectionData &SymSD = Asm.getSectionData( + SD->getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&SymSD); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + + Type = macho::RIT_Vanilla; + } + + // struct relocation_info (8 bytes) + macho::RelocationEntry MRE; + MRE.Word0 = FixupOffset; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp new file mode 100644 index 0000000..ed64a32 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -0,0 +1,79 @@ +//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace llvm { + class MCObjectWriter; +} + +namespace { + class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { + const bool Is64Bit; + + public: + X86WinCOFFObjectWriter(bool Is64Bit_); + ~X86WinCOFFObjectWriter(); + + virtual unsigned getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection) const LLVM_OVERRIDE; + }; +} + +X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_) + : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 : + COFF::IMAGE_FILE_MACHINE_I386), + Is64Bit(Is64Bit_) {} + +X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} + +unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection) const { + unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? + MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB : + COFF::IMAGE_REL_I386_DIR32NB; + return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32; + case FK_Data_8: + if (Is64Bit) + return COFF::IMAGE_REL_AMD64_ADDR64; + llvm_unreachable("unsupported relocation type"); + case FK_SecRel_4: + return Is64Bit ? 
COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
+ bool Is64Bit) {
+ MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..815d235
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+ RegisterTarget<Triple::x86, /*HasJIT=*/true>
+ X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+ Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 0000000..bbd4904
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,217 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ // Default to copying the dest value.
+ ShuffleMask.push_back(0);
+ ShuffleMask.push_back(1);
+ ShuffleMask.push_back(2);
+ ShuffleMask.push_back(3);
+
+ // Decode the immediate.
+ unsigned ZMask = Imm & 15;
+ unsigned CountD = (Imm >> 4) & 3;
+ unsigned CountS = (Imm >> 6) & 3;
+
+ // CountS selects which input element to use.
+ unsigned InVal = 4+CountS;
+ // CountD specifies which element of destination to update.
+ ShuffleMask[CountD] = InVal;
+ // ZMask zaps values, potentially overriding the CountD elt.
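A worked example may help here; the immediate is hypothetical, not drawn from a real instruction stream. For Imm = 0x55: CountS = 1, CountD = 1, and ZMask = 0b0101, so the decode produces <Zero, 5, Zero, 3>:

#include <cassert>
#include <vector>

void insertpsExample() {
  std::vector<int> Mask = {0, 1, 2, 3}; // default: copy the dest
  Mask[1] = 4 + 1;                      // element CountD <- source element CountS
  Mask[0] = Mask[2] = -1;               // ZMask bits 0 and 2 zero those lanes
  assert(Mask == (std::vector<int>{-1, 5, -1, 3})); // -1 == SM_SentinelZero
}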
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(NElts+i); + + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(NElts+i); +} + +void DecodePALIGNRMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Offset; + // if i+offset is out of this lane then we actually need the other source + if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + +/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + l); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } + } +} + +void DecodePSHUFLWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + } +} + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. 
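Before the definition, a worked example of this decode with hypothetical operands: for a v4f32 shufps with Imm = 0x1B, each element takes two index bits; the low half of each lane reads src1 and the high half reads src2 (offset by NumElts = 4), giving the mask <3, 2, 5, 4>:

#include <cassert>
#include <vector>

void shufpsExample() {
  unsigned Imm = 0x1B;
  std::vector<int> Mask;
  for (unsigned Src = 0; Src != 8; Src += 4)   // src1 half, then src2 half
    for (unsigned i = 0; i != 2; ++i, Imm >>= 2)
      Mask.push_back((Imm & 3) + Src);
  assert(Mask == (std::vector<int>{3, 2, 5, 4}));
}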
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + // each half of a lane comes from different source + for (unsigned s = 0; s != NumElts*2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + } + } +} + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + } + } +} + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + if (Imm & 0x88) + return; // Not a shuffle + + unsigned HalfSize = VT.getVectorNumElements()/2; + + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); + } +} + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm >> (2*i)) & 3); + } +} + +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h new file mode 100644 index 0000000..017ab32 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -0,0 +1,71 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_SHUFFLE_DECODE_H +#define X86_SHUFFLE_DECODE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ValueTypes.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +enum { + SM_SentinelZero = -1 +}; + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h new file mode 100644 index 0000000..947002f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -0,0 +1,80 @@ +//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the x86 +// target library, as used by the LLVM JIT. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class FunctionPass;
+class JITCodeEmitter;
+class X86TargetMachine;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// createGlobalBaseRegPass - This pass initializes a global base
+/// register for PIC on x86-32.
+FunctionPass* createGlobalBaseRegPass();
+
+/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
+/// to local-dynamic TLS variables so that the TLS base address for the module
+/// is only fetched once per execution path through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid transition penalty between functions encoded with
+/// AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// createX86JITCodeEmitterPass - Return a pass that emits the collected X86
+/// code to the specified MCE object.
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE);
+
+/// createEmitX86CodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// \brief Creates an X86-specific Target Transformation Info pass.
+ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
+
+/// createX86PadShortFunctions - Return a pass that pads short functions
+/// with NOOPs. This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+/// createX86FixupLEAs - Return a pass that selectively replaces
+/// certain instructions (like add, sub, inc, dec, some shifts,
+/// and some multiplies) by equivalent LEA instructions, in order
+/// to eliminate execution delays in some Atom processors.
+FunctionPass *createX86FixupLEAs();
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 0000000..c865500
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,357 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget state +// + +def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", + "64-bit mode (x86_64)">; + +//===----------------------------------------------------------------------===// +// X86 Subtarget features +//===----------------------------------------------------------------------===// + +def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", + "Enable conditional move instructions">; + +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + + +def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", + "Enable MMX instructions">; +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + // SSE codegen depends on cmovs, and all + // SSE1+ processors support them. + [FeatureMMX, FeatureCMOV]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41", + "Enable SSE 4.1 instructions", + [FeatureSSSE3]>; +def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", + "Enable SSE 4.2 instructions", + [FeatureSSE41]>; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions", + [FeatureMMX]>; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! Athlon instructions", + [Feature3DNow]>; +// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied +// feature, because SSE2 can be disabled (e.g. for compiling OS kernels) +// without disabling 64-bit mode. 
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions",
+ [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+ "64-bit with cmpxchg16b",
+ [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
+ "IsUAMemFast", "true",
+ "Fast unaligned memory access">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions",
+ [FeatureSSE3]>;
+
+def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+ "Enable AVX instructions",
+ [FeatureSSE42]>;
+def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+ "Enable AVX2 instructions",
+ [FeatureAVX]>;
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+ "Enable three-operand fused multiply-add",
+ [FeatureAVX]>;
+def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
+ "Enable four-operand fused multiply-add",
+ [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
+ "Enable XOP instructions",
+ [FeatureFMA4]>;
+def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
+ "HasVectorUAMem", "true",
+ "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES instructions",
+ [FeatureSSE2]>;
+def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
+ "Support MOVBE instruction">;
+def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+ "Support RDRAND instruction">;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions">;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+ "Support FS/GS Base instructions">;
+def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+ "Support LZCNT instruction">;
+def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
+ "Support BMI instructions">;
+def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
+ "Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
+def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
+ "Support HLE">;
+def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
+ "Support ADX instructions">;
+def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+ "Support PRFCHW instructions">;
+def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+ "Support RDSEED instruction">;
+def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+ "HasSlowDivide", "true",
+ "Use small divide for positive values less than 256">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+ "CallRegIndirect", "true",
+ "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+ "LEA instruction needs inputs at AG stage">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===// + +include "X86Schedule.td" + +def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", + "Intel Atom processors">; + +class Proc<string Name, list<SubtargetFeature> Features> + : ProcessorModel<Name, GenericModel, Features>; + +def : Proc<"generic", []>; +def : Proc<"i386", []>; +def : Proc<"i486", []>; +def : Proc<"i586", []>; +def : Proc<"pentium", []>; +def : Proc<"pentium-mmx", [FeatureMMX]>; +def : Proc<"i686", []>; +def : Proc<"pentiumpro", [FeatureCMOV]>; +def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; +def : Proc<"pentium3", [FeatureSSE1]>; +def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSSE2]>; +def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; +// Intel Core Duo. +def : ProcessorModel<"yonah", SandyBridgeModel, + [FeatureSSE3, FeatureSlowBTMem]>; + +// NetBurst. +def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; + +// Intel Core 2 Solo/Duo. +def : ProcessorModel<"core2", SandyBridgeModel, + [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : ProcessorModel<"penryn", SandyBridgeModel, + [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; + +// Atom. +def : ProcessorModel<"atom", AtomModel, + [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, + FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, + FeatureSlowDivide, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions]>; + +// "Arrandale" along with corei3 and corei5 +def : ProcessorModel<"corei7", SandyBridgeModel, + [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>; + +def : ProcessorModel<"nehalem", SandyBridgeModel, + [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureFastUAMem, FeaturePOPCNT]>; +// Westmere is a similar machine to nehalem with some additional features. +// Westmere is the corei3/i5/i7 path from nehalem to sandybridge +def : ProcessorModel<"westmere", SandyBridgeModel, + [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureFastUAMem, FeaturePOPCNT, FeatureAES, + FeaturePCLMUL]>; +// Sandy Bridge +// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, +// rather than a superset. 
+def : ProcessorModel<"corei7-avx", SandyBridgeModel, + [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; +// Ivy Bridge +def : ProcessorModel<"core-avx-i", SandyBridgeModel, + [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, + FeatureF16C, FeatureFSGSBase]>; + +// Haswell +def : ProcessorModel<"core-avx2", HaswellModel, + [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, + FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, + FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, + FeatureHLE]>; + +def : Proc<"k6", [FeatureMMX]>; +def : Proc<"k6-2", [Feature3DNow]>; +def : Proc<"k6-3", [Feature3DNow]>; +def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"amdfam10", [FeatureSSE4A, + Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowBTMem]>; +// Bobcat +def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureLZCNT, FeaturePOPCNT]>; +// Jaguar +def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, FeatureBMI, + FeatureF16C, FeatureMOVBE, FeatureLZCNT, + FeaturePOPCNT]>; +// Bulldozer +def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, + FeatureLZCNT, FeaturePOPCNT]>; +// Piledriver +def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, + FeatureF16C, FeatureLZCNT, + FeaturePOPCNT, FeatureBMI, FeatureFMA]>; +def : Proc<"geode", [Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureMMX]>; +def : Proc<"winchip2", [Feature3DNow]>; +def : Proc<"c3", [Feature3DNow]>; +def : Proc<"c3-2", [FeatureSSE1]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86InstrInfo.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "X86CallingConv.td" + + +//===----------------------------------------------------------------------===// +// Assembly Parser 
+//===----------------------------------------------------------------------===// + +def ATTAsmParser : AsmParser { + string AsmParserClassName = "AsmParser"; +} + +def ATTAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // Variant name. + string Name = "att"; + + // Discard comments in assembly strings. + string CommentDelimiter = "#"; + + // Recognize hard coded registers. + string RegisterPrefix = "%"; +} + +def IntelAsmParserVariant : AsmParserVariant { + int Variant = 1; + + // Variant name. + string Name = "intel"; + + // Discard comments in assembly strings. + string CommentDelimiter = ";"; + + // Recognize hard coded registers. + string RegisterPrefix = ""; +} + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. +// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTInstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelInstPrinter"; + int Variant = 1; + bit isMCAsmWriter = 1; +} + +def X86 : Target { + // Information about the instructions... + let InstructionSet = X86InstrInfo; + let AssemblyParsers = [ATTAsmParser]; + let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp new file mode 100644 index 0000000..6b228b0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -0,0 +1,755 @@ +//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to X86 machine code. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "X86.h" +#include "X86COFFMachineModuleInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Primitive Helper Functions. 
+//===----------------------------------------------------------------------===// + +/// runOnMachineFunction - Emit the function body. +/// +bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SetupMachineFunction(MF); + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) { + bool Intrn = MF.getFunction()->hasInternalLinkage(); + OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer.EndCOFFSymbolDef(); + } + + // Have common code print out the function header with linkage info etc. + EmitFunctionHeader(); + + // Emit the rest of the function body. + EmitFunctionBody(); + + // We didn't modify anything. + return false; +} + +/// printSymbolOperand - Print a raw symbol reference operand. This handles +/// jump tables, constant pools, global address and external symbols, all of +/// which print to a label with various suffixes for relocation types etc. +void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { + switch (MO.getType()) { + default: llvm_unreachable("unknown symbol type!"); + case MachineOperand::MO_JumpTableIndex: + O << *GetJTISymbol(MO.getIndex()); + break; + case MachineOperand::MO_ConstantPoolIndex: + O << *GetCPISymbol(MO.getIndex()); + printOffset(MO.getOffset(), O); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + + MCSymbol *GVSym; + if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) + GVSym = GetSymbolWithGlobalValueBase(GV, "$stub"); + else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || + MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) + GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + else + GVSym = Mang->getSymbol(GV); + + // Handle dllimport linkage. + if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) + GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + + if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { + MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ + MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { + MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } + + // If the name begins with a dollar-sign, enclose it in parens. We do this + // to avoid having it look like an integer immediate to the assembler. 
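The quoting rule applied just below is small but easy to miss: in AT&T syntax a bare "$foo" operand would parse as an immediate, so such names are wrapped in parens. A standalone sketch of the same rule, for illustration only:

#include <string>

std::string quoteForAsm(const std::string &Name) {
  // "($foo)" forces a symbol reference; "$foo" would read as an immediate.
  return (!Name.empty() && Name[0] == '$') ? "(" + Name + ")" : Name;
}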
+ if (GVSym->getName()[0] != '$')
+ O << *GVSym;
+ else
+ O << '(' << *GVSym << ')';
+ printOffset(MO.getOffset(), O);
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const MCSymbol *SymToPrint;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ SmallString<128> TempNameStr;
+ TempNameStr += StringRef(MO.getSymbolName());
+ TempNameStr += StringRef("$stub");
+
+ MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()),
+ true);
+ }
+ SymToPrint = StubSym.getPointer();
+ } else {
+ SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (SymToPrint->getName()[0] != '$')
+ O << *SymToPrint;
+ else
+ O << '(' << *SymToPrint << ')';
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-" << *MF->getPICBaseSymbol() << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ O << '-' << *MF->getPICBaseSymbol();
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ case X86II::MO_TLVP: O << "@TLVP"; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
+ break;
+ case X86II::MO_SECREL: O << "@SECREL32"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ printOperand(MI, OpNo, O); + return; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return; + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + printSymbolOperand(MO, O); + return; + } +} + + +void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier, + unsigned AsmVariant) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Register: { + // FIXME: Enumerating AsmVariant, so we can remove magic number. + if (AsmVariant == 0) O << '%'; + unsigned Reg = MO.getReg(); + if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ? + MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : + ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); + Reg = getX86SubSuperRegister(Reg, VT); + } + O << X86ATTInstPrinter::getRegisterName(Reg); + return; + } + + case MachineOperand::MO_Immediate: + if (AsmVariant == 0) O << '$'; + O << MO.getImm(); + return; + + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: { + if (AsmVariant == 0) O << '$'; + printSymbolOperand(MO, O); + break; + } + } +} + +void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier) { + const MachineOperand &BaseReg = MI->getOperand(Op); + const MachineOperand &IndexReg = MI->getOperand(Op+2); + const MachineOperand &DispSpec = MI->getOperand(Op+3); + + // If we really don't want to print out (rip), don't. + bool HasBaseReg = BaseReg.getReg() != 0; + if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") && + BaseReg.getReg() == X86::RIP) + HasBaseReg = false; + + // HasParenPart - True if we will print out the () part of the mem ref. 
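The code below assembles the AT&T memory-operand form "disp(base,index,scale)", omitting pieces that are absent or default. A compact standalone sketch of the same grammar (the register names in the example are illustrative, not taken from the printer):

#include <cstdio>

void printAttMemRef(long Disp, const char *Base, const char *Index,
                    unsigned Scale) {
  if (Disp || (!Base && !Index))
    printf("%ld", Disp);           // displacement, or bare 0 if nothing else
  if (Base || Index) {
    printf("(");
    if (Base) printf("%%%s", Base);
    if (Index) {
      printf(",%%%s", Index);
      if (Scale != 1) printf(",%u", Scale); // scale of 1 is implicit
    }
    printf(")");
  }
}
// printAttMemRef(8, "rax", "rcx", 4) prints "8(%rax,%rcx,4)"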
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg; + + if (DispSpec.isImm()) { + int DispVal = DispSpec.getImm(); + if (DispVal || !HasParenPart) + O << DispVal; + } else { + assert(DispSpec.isGlobal() || DispSpec.isCPI() || + DispSpec.isJTI() || DispSpec.isSymbol()); + printSymbolOperand(MI->getOperand(Op+3), O); + } + + if (Modifier && strcmp(Modifier, "H") == 0) + O << "+8"; + + if (HasParenPart) { + assert(IndexReg.getReg() != X86::ESP && + "X86 doesn't allow scaling by ESP"); + + O << '('; + if (HasBaseReg) + printOperand(MI, Op, O, Modifier); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op+2, O, Modifier); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + if (ScaleVal != 1) + O << ',' << ScaleVal; + } + O << ')'; + } +} + +void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier) { + assert(isMem(MI, Op) && "Invalid memory reference!"); + const MachineOperand &Segment = MI->getOperand(Op+4); + if (Segment.getReg()) { + printOperand(MI, Op+4, O, Modifier); + O << ':'; + } + printLeaMemReference(MI, Op, O, Modifier); +} + +void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier, + unsigned AsmVariant){ + const MachineOperand &BaseReg = MI->getOperand(Op); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + const MachineOperand &IndexReg = MI->getOperand(Op+2); + const MachineOperand &DispSpec = MI->getOperand(Op+3); + const MachineOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+4, O, Modifier, AsmVariant); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+2, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + printOperand(MI, Op+3, O, Modifier, AsmVariant); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << DispVal; + } + } + O << ']'; +} + +bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, + raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: return true; // Unknown mode. + case 'b': // Print QImode register + Reg = getX86SubSuperRegister(Reg, MVT::i8); + break; + case 'h': // Print QImode high register + Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + break; + case 'w': // Print HImode register + Reg = getX86SubSuperRegister(Reg, MVT::i16); + break; + case 'k': // Print SImode register + Reg = getX86SubSuperRegister(Reg, MVT::i32); + break; + case 'q': // Print DImode register + Reg = getX86SubSuperRegister(Reg, MVT::i64); + break; + } + + O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
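For context, the single-letter modifiers handled below are the ones GCC-style inline asm exposes on x86. A hypothetical user-side example (GCC/Clang extended asm, x86 only; the function name is made up for illustration):

static inline int swapLowBytes(int x) {
  // %k0 would print the 32-bit register (e.g. %eax); %b0 prints its low
  // byte (%al) and %h0 its high byte (%ah) of the same operand.
  asm("xchgb %b0, %h0" : "+Q"(x));
  return x;
}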
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ if (MO.isImm()) {
+ O << MO.getImm();
+ return false;
+ }
+ if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
+ printSymbolOperand(MO, O);
+ if (Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+ return false;
+ }
+ if (MO.isReg()) {
+ O << '(';
+ printOperand(MI, OpNo, O);
+ O << ')';
+ return false;
+ }
+ return true;
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ if (MO.isImm())
+ O << MO.getImm();
+ else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
+ printSymbolOperand(MO, O);
+ else
+ printOperand(MI, OpNo, O);
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MO.isReg())
+ return printAsmMRegister(MO, ExtraCode[0], O);
+ printOperand(MI, OpNo, O);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ printPCRelImm(MI, OpNo, O);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled target
+ // independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ O << '-';
+ }
+ }
+
+ printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
+ return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (AsmVariant) {
+ printIntelMemReference(MI, OpNo, O);
+ return false;
+ }
+
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'H':
+ printMemReference(MI, OpNo, O, "H");
+ return false;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(MI, OpNo, O, "no-rip");
+ return false;
+ }
+ }
+ printMemReference(MI, OpNo, O);
+ return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (Subtarget->isTargetEnvMacho())
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (Subtarget->isTargetEnvMacho()) {
+ // All darwin targets use mach-o.
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs; + + Stubs = MMIMacho.GetFnStubList(); + if (!Stubs.empty()) { + const MCSection *TheSection = + OutContext.getMachOSection("__IMPORT", "__jump_table", + MCSectionMachO::S_SYMBOL_STUBS | + MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE | + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, + 5, SectionKind::getMetadata()); + OutStreamer.SwitchSection(TheSection); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(), + MCSA_IndirectSymbol); + // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. + const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; + OutStreamer.EmitBytes(StringRef(HltInsts, 5)); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Output stubs for external and common global variables. + Stubs = MMIMacho.GetGVStubList(); + if (!Stubs.empty()) { + const MCSection *TheSection = + OutContext.getMachOSection("__IMPORT", "__pointers", + MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer.SwitchSection(TheSection); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$non_lazy_ptr: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), + MCSA_IndirectSymbol); + // .long 0 + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs. However, sometimes the types are local to the file. So + // we need to fill in the value for the NLP in those cases. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), + OutContext), 4/*size*/); + } + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(2); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$non_lazy_ptr: + OutStreamer.EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer.EmitValue(MCSymbolRefExpr:: + Create(Stubs[i].second.getPointer(), + OutContext), 4/*size*/); + } + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } + + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() && + MMI->usesVAFloatArgument()) { + StringRef SymbolName = Subtarget->is64Bit() ? 
"_fltused" : "__fltused"; + MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName); + OutStreamer.EmitSymbolAttribute(S, MCSA_Global); + } + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) { + X86COFFMachineModuleInfo &COFFMMI = + MMI->getObjFileInfo<X86COFFMachineModuleInfo>(); + + // Emit type information for external functions + typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator; + for (externals_iterator I = COFFMMI.externals_begin(), + E = COFFMMI.externals_end(); + I != E; ++I) { + OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer.EndCOFFSymbolDef(); + } + + // Necessary for dllexport support + std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; + + const TargetLoweringObjectFileCOFF &TLOFCOFF = + static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); + + for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) + if (I->hasDLLExportLinkage()) + DLLExportedFns.push_back(Mang->getSymbol(I)); + + for (Module::const_global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) + if (I->hasDLLExportLinkage()) + DLLExportedGlobals.push_back(Mang->getSymbol(I)); + + // Output linker support code for dllexported globals on windows. + if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) { + OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection()); + SmallString<128> name; + for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) { + if (Subtarget->isTargetWindows()) + name = " /EXPORT:"; + else + name = " -export:"; + name += DLLExportedGlobals[i]->getName(); + if (Subtarget->isTargetWindows()) + name += ",DATA"; + else + name += ",data"; + OutStreamer.EmitBytes(name); + } + + for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) { + if (Subtarget->isTargetWindows()) + name = " /EXPORT:"; + else + name = " -export:"; + name += DLLExportedFns[i]->getName(); + OutStreamer.EmitBytes(name); + } + } + } + + if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const DataLayout *TD = TM.getDataLayout(); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + TD->getPointerSize()); + } + Stubs.clear(); + } + } +} + +MachineLocation +X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + + if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &O) { + // Only the target-dependent form of DBG_VALUE should get here. 
+ // Referencing the offset and metadata as NOps-2 and NOps-1 is + // probably portable to other targets; frame pointer location is not. + unsigned NOps = MI->getNumOperands(); + assert(NOps==7); + O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + if (V.getContext().isSubprogram()) + O << DISubprogram(V.getContext()).getDisplayName() << ":"; + O << V.getName(); + O << " <- "; + // Frame address. Currently handles register +- offset only. + O << '['; + if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) + printOperand(MI, 0, O); + else + O << "undef"; + O << '+'; printOperand(MI, 3, O); + O << ']'; + O << "+"; + printOperand(MI, NOps-2, O); +} + + + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmPrinter() { + RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target); + RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target); +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h new file mode 100644 index 0000000..bc7496b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -0,0 +1,79 @@ +//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef X86ASMPRINTER_H +#define X86ASMPRINTER_H + +#include "X86.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + +class MCStreamer; + +class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { + const X86Subtarget *Subtarget; + public: + explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + } + + virtual const char *getPassName() const LLVM_OVERRIDE { + return "X86 Assembly / Object Emitter"; + } + + const X86Subtarget &getSubtarget() const { return *Subtarget; } + + virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE; + + virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE; + + virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE; + + void printSymbolOperand(const MachineOperand &MO, raw_ostream &O); + + // These methods are used by the tablegen'erated instruction printer. 
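+ // (For instance, printOperand is what renders a register operand as
+ // "%eax" in the AT&T variant, or as plain "eax" when AsmVariant selects
+ // Intel syntax.)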
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier = 0, unsigned AsmVariant = 0);
+ void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+ bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+ virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+ virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+
+ void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+ const char *Modifier=NULL);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+ const char *Modifier=NULL);
+
+ void printIntelMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier=NULL,
+ unsigned AsmVariant = 1);
+
+ virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
+
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+ virtual MachineLocation
+ getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
new file mode 100644
index 0000000..6a6125b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -0,0 +1,19 @@
+//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86COFFMachineModuleInfo.h"
+using namespace llvm;
+
+
+X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
+}
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
new file mode 100644
index 0000000..0dfeb42
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -0,0 +1,46 @@
+//===-- X86COFFMachineModuleInfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_MACHINEMODULEINFO_H
+#define X86COFF_MACHINEMODULEINFO_H
+
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+namespace llvm {
+ class X86MachineFunctionInfo;
+ class DataLayout;
+
+/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
+/// for X86 COFF targets.
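+/// (It records the external function symbols for which COFF type directives
+/// must be emitted at the end of the file.)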
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
+ DenseSet<MCSymbol const *> Externals;
+public:
+ X86COFFMachineModuleInfo(const MachineModuleInfo &) {}
+ virtual ~X86COFFMachineModuleInfo();
+
+ void addExternalFunction(MCSymbol* Symbol) {
+ Externals.insert(Symbol);
+ }
+
+ typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
+ externals_iterator externals_begin() const { return Externals.begin(); }
+ externals_iterator externals_end() const { return Externals.end(); }
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..9eafbd5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,537 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit vectors are returned in YMM0 and YMM1, when they fit. YMM2 and YMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX target feature.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in ST0 (even with SSE).
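+ // (ST1 below only matters when two f80 values are returned at once; a
+ // single long double return uses just ST0.)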
+ CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in ST0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another,
+ // weirdly; this is really the sse-regparm calling convention) in which
+ // case they use XMM0, otherwise it is the same as the common X86 calling
+ // conv.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+ // For integers, ECX can be used as an extra return register
+ CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+ // Otherwise, it is the same as the common X86 calling convention.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+ // Vector types are returned in XMM0,XMM1,XMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+ // Promote all types to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // If HiPE, use RetCC_X86_32_HiPE.
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[ + // HiPE uses RetCC_X86_64_HiPE + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<RetCC_X86_64_C> +]>; + +// This is the return-value convention used for the entire X86 backend. +def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, + CCDelegateTo<RetCC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// X86-64 Argument Calling Conventions +//===----------------------------------------------------------------------===// + +def CC_X86_64_C : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<8, 8>>, + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // The first 8 MMX vector arguments are passed in XMM registers on Darwin. + CCIfType<[x86mmx], + CCIfSubtarget<"isTargetDarwin()", + CCIfSubtarget<"hasSSE2()", + CCPromoteToType<v2i64>>>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, + + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>> +]>; + +// Calling convention used on Win64 +def CC_X86_Win64_C : CallingConv<[ + // FIXME: Handle byval stuff. + // FIXME: Handle varargs. + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // 128 bit vectors are passed by pointer + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, + + + // 256 bit vectors are passed by pointer + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + + // The first 4 MMX vector arguments are passed in GPRs. 
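+ // (They are bitconverted to i64 first, so an __m64 argument simply takes
+ // one of the RCX/RDX/R8/R9 slots via the i64 rules that follow.)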
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // The first 4 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // The first 4 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], + [RCX , RDX , R8 , R9 ]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80], CCAssignToStack<0, 0>> +]>; + +def CC_X86_64_GHC : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], + CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>, + + // Pass in STG registers: F1, F2, F3, F4, D1, D2 + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> +]>; + +def CC_X86_64_HiPE : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + +//===----------------------------------------------------------------------===// +// X86 C Calling Convention +//===----------------------------------------------------------------------===// + +/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP +/// values are spilled on the stack, and the first 4 vector values go in XMM +/// regs. +def CC_X86_32_Common : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // The first 3 float or double arguments, if marked 'inreg' and if the call + // is not a vararg call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + + // The first 3 __m64 vector arguments are passed in mmx registers if the + // call is not a vararg call. + CCIfNotVarArg<CCIfType<[x86mmx], + CCAssignToReg<[MM0, MM1, MM2]>>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Doubles get 8-byte slots that are 4-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 4>>, + + // Long doubles get slots whose size depends on the subtarget. + CCIfType<[f80], CCAssignToStack<0, 4>>, + + // The first 4 SSE vector arguments are passed in XMM registers. 
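+ // (Only for non-vararg calls: the CCIfNotVarArg guard below sends vector
+ // arguments of vararg calls to the stack rules further down.)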
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // The first 4 AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are + // passed in the parameter area. + CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>; + +def CC_X86_32_C : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in ECX. + CCIfNest<CCAssignToReg<[ECX]>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCall : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_ThisCall : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass sret arguments indirectly through stack. + CCIfSRet<CCAssignToStack<4, 4>>, + + // The first integer argument is passed in ECX + CCIfType<[i32], CCAssignToReg<[ECX]>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCC : CallingConv<[ + // Handles byval parameters. Note that we can't rely on the delegation + // to CC_X86_32_Common for this because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // The first 3 float or double arguments, if the call is not a vararg + // call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + + // Doubles get 8-byte slots that are 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_GHC : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in STG registers: Base, Sp, Hp, R1 + CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> +]>; + +def CC_X86_32_HiPE : CallingConv<[ + // Promote i8/i16 arguments to i32. 
+ CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>> +]>; + +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>, + CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, + CCDelegateTo<CC_X86_32_C> +]>; + +//===----------------------------------------------------------------------===// +// X86 Root Argument Calling Conventions +//===----------------------------------------------------------------------===// + +// This is the root argument convention for the X86-32 backend. +def CC_X86_32 : CallingConv<[ + CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, + CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + + // Otherwise, drop to normal X86-32 CC + CCDelegateTo<CC_X86_32_C> +]>; + +// This is the root argument convention for the X86-64 backend. +def CC_X86_64 : CallingConv<[ + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<CC_X86_64_C> +]>; + +// This is the argument convention used for the entire X86 backend. +def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, + CCDelegateTo<CC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// Callee-saved Registers. 
+//===----------------------------------------------------------------------===// + +def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; +def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; + +def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; +def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; + +def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, + (sequence "XMM%u", 6, 15))>; + +def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, + R11, R12, R13, R14, R15, RBP, + (sequence "XMM%u", 0, 15))>; + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp new file mode 100644 index 0000000..8fea6ed --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp @@ -0,0 +1,1507 @@ +//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the X86 machine instructions into +// relocatable machine code. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-emitter" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86Relocations.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/PassManager.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +STATISTIC(NumEmitted, "Number of machine instructions emitted"); + +namespace { + template<class CodeEmitter> + class Emitter : public MachineFunctionPass { + const X86InstrInfo *II; + const DataLayout *TD; + X86TargetMachine &TM; + CodeEmitter &MCE; + MachineModuleInfo *MMI; + intptr_t PICBaseOffset; + bool Is64BitMode; + bool IsPIC; + public: + static char ID; + explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) + : MachineFunctionPass(ID), II(0), TD(0), TM(tm), + MCE(mce), PICBaseOffset(0), Is64BitMode(false), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + Emitter(X86TargetMachine &tm, CodeEmitter &mce, + const X86InstrInfo &ii, const DataLayout &td, bool is64) + : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), + MCE(mce), PICBaseOffset(0), Is64BitMode(is64), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "X86 Machine Code Emitter"; + } + + void 
emitOpcodePrefix(uint64_t TSFlags, int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const; + + void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const; + + void emitSegmentOverridePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI) const; + + void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineModuleInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + private: + void emitPCRelativeBlockAddress(MachineBasicBlock *MBB); + void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + intptr_t Disp = 0, intptr_t PCAdj = 0, + bool Indirect = false); + void emitExternalSymbolAddress(const char *ES, unsigned Reloc); + void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0, + intptr_t PCAdj = 0); + void emitJumpTableAddress(unsigned JTI, unsigned Reloc, + intptr_t PCAdj = 0); + + void emitDisplacementField(const MachineOperand *RelocOp, int DispVal, + intptr_t Adj = 0, bool IsPCRel = true); + + void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField); + void emitRegModRMByte(unsigned RegOpcodeField); + void emitSIBByte(unsigned SS, unsigned Index, unsigned Base); + void emitConstant(uint64_t Val, unsigned Size); + + void emitMemModRMByte(const MachineInstr &MI, + unsigned Op, unsigned RegOpcodeField, + intptr_t PCAdj = 0); + + unsigned getX86RegNum(unsigned RegNo) const { + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + return TRI->getEncodingValue(RegNo) & 0x7; + } + + unsigned char getVEXRegisterEncoding(const MachineInstr &MI, + unsigned OpNum) const; + }; + +template<class CodeEmitter> + char Emitter<CodeEmitter>::ID = 0; +} // end anonymous namespace. + +/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code +/// to the specified JITCodeEmitter object. +FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM, + JITCodeEmitter &JCE) { + return new Emitter<JITCodeEmitter>(TM, JCE); +} + +template<class CodeEmitter> +bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { + MMI = &getAnalysis<MachineModuleInfo>(); + MCE.setModuleInfo(MMI); + + II = TM.getInstrInfo(); + TD = TM.getDataLayout(); + Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); + IsPIC = TM.getRelocationModel() == Reloc::PIC_; + + do { + DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n"); + MCE.startFunction(MF); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + MBB != E; ++MBB) { + MCE.StartMachineBasicBlock(MBB); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + const MCInstrDesc &Desc = I->getDesc(); + emitInstruction(*I, &Desc); + // MOVPC32r is basically a call plus a pop instruction. + if (Desc.getOpcode() == X86::MOVPC32r) + emitInstruction(*I, &II->get(X86::POP32r)); + ++NumEmitted; // Keep track of the # of mi's emitted + } + } + } while (MCE.finishFunction(MF)); + + return false; +} + +/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +static unsigned determineREX(const MachineInstr &MI) { + unsigned REX = 0; + const MCInstrDesc &Desc = MI.getDesc(); + + // Pseudo instructions do not need REX prefix byte. 
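+ // (For reference while reading the bit tests below: the final REX byte is
+ // 0x40 | W<<3 | R<<2 | X<<1 | B, so e.g. "mov %rax,%r8" is encoded as
+ // 49 89 c0, where 0x49 is REX with the W and B bits set.)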
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) + return 0; + if (Desc.TSFlags & X86II::REX_W) + REX |= 1 << 3; + + unsigned NumOps = Desc.getNumOperands(); + if (NumOps) { + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 1 : 0; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (X86II::isX86_64NonExtLowByteReg(Reg)) + REX |= 0x40; + } + } + + switch (Desc.TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= (1 << 0) | (1 << 2); + break; + case X86II::MRMSrcReg: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << 0; + } + break; + } + case X86II::MRMSrcMem: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 1 : 0; + if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e))) + REX |= 1 << 2; + unsigned Bit = 0; + for (; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + default: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 0; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << 2; + } + break; + } + } + } + return REX; +} + + +/// emitPCRelativeBlockAddress - This method keeps track of the information +/// necessary to resolve the address of this block later and emits a dummy +/// value. +/// +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) { + // Remember where this reference was and where it is to so we can + // deal with it later. + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + X86::reloc_pcrel_word, MBB)); + MCE.emitWordLE(0); +} + +/// emitGlobalAddress - Emit the specified address to the code stream assuming +/// this is part of a "take the address of a global" instruction. +/// +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV, + unsigned Reloc, + intptr_t Disp /* = 0 */, + intptr_t PCAdj /* = 0 */, + bool Indirect /* = false */) { + intptr_t RelocCST = Disp; + if (Reloc == X86::reloc_picrel_word) + RelocCST = PICBaseOffset; + else if (Reloc == X86::reloc_pcrel_word) + RelocCST = PCAdj; + MachineRelocation MR = Indirect + ? 
MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV),
+ RelocCST, false)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV), RelocCST, false);
+ MCE.addRelocation(MR);
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+ unsigned Reloc) {
+ intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+
+ // X86 never needs stubs because instruction selection will always pick
+ // an instruction sequence that is large enough to hold any address
+ // to a symbol.
+ // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
+ bool NeedStub = false;
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, RelocCST,
+ 0, NeedStub));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+ unsigned RegOpcodeFld){
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+ unsigned Index,
+ unsigned Base) {
+ // SIB byte is in the same format as the ModRMByte...
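+ // e.g. [EAX+ECX*4] has SS=2 (scale 4), Index=ECX=1, Base=EAX=0, giving
+ // the SIB byte 0b10001000 = 0x88.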
+ MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+ // Output the constant in little endian byte order...
+ for (unsigned i = 0; i != Size; ++i) {
+ MCE.emitByte(Val & 255);
+ Val >>= 8;
+ }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+ const TargetMachine &TM) {
+ // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
+ // mechanism as 32-bit mode.
+ if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+ return false;
+
+ // Return true if this is a reference to a stub containing the address of the
+ // global, not the global itself.
+ return isGlobalStubReference(GVOp.getTargetFlags());
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+ int DispVal,
+ intptr_t Adj /* = 0 */,
+ bool IsPCRel /* = true */) {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (!RelocOp) {
+ emitConstant(DispVal, 4);
+ return;
+ }
+
+ // Otherwise, this is something that requires a relocation. Emit it as such
+ // now.
+ unsigned RelocType = Is64BitMode ?
+ (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (RelocOp->isGlobal()) {
+ // In 64-bit static small code model, we could potentially emit absolute.
+ // But it's probably not beneficial. If the MCE supports using RIP directly
+ // do it, otherwise fall back to absolute (this is determined by IsPCRel).
+ // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
+ // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
+ bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+ emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+ Adj, Indirect);
+ } else if (RelocOp->isSymbol()) {
+ emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
+ } else if (RelocOp->isCPI()) {
+ emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+ RelocOp->getOffset(), Adj);
+ } else {
+ assert(RelocOp->isJTI() && "Unexpected machine operand!");
+ emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op,unsigned RegOpcodeField,
+ intptr_t PCAdj) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+
+ // Figure out what sort of displacement we have to handle here.
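+ // (e.g. a frame access such as [EBP-8] arrives here as a plain immediate
+ // in Op3 and needs no relocation, while a reference to some global foo,
+ // say [foo+4], is deferred to the relocation handling below)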
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isSymbol()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+ DispVal += Op3.getOffset();
+ }
+ } else if (Op3.isJTI()) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+ }
+ } else {
+ DispVal = Op3.getImm();
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &Scale = MI.getOperand(Op+1);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP ||
+ (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
+ assert(IndexReg.getReg() == 0 && Is64BitMode &&
+ "Invalid rip-relative address");
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+ return;
+ }
+
+ // Indicate that the displacement will use a pcrel or absolute reference
+ // by default. MCEs able to resolve addresses on-the-fly use pcrel by default
+ // while others, unless explicitly asked to use RIP, use absolute references.
+ bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
+
+ // Is a SIB byte needed?
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+ unsigned BaseRegNo = -1U;
+ if (BaseReg != 0 && BaseReg != X86::RIP)
+ BaseRegNo = getX86RegNum(BaseReg);
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!Is64BitMode || BaseReg != 0)) {
+ if (BaseReg == 0 || // [disp32] in X86-32 mode
+ BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding; this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (!DispForReloc && isDisp8(DispVal)) {
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+ emitConstant(DispVal, 1);
+ return;
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+ return;
+ }
+
+ // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
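+ // (In every SIB form the ModR/M byte carries R/M=4 (0b100) to announce the
+ // SIB byte; Mod still selects between no displacement, disp8 and disp32,
+ // just as in the non-SIB cases above.)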
+ assert(IndexReg.getReg() != X86::ESP && + IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + + bool ForceDisp32 = false; + bool ForceDisp8 = false; + if (BaseReg == 0) { + // If there is no base register, we emit the special case SIB byte with + // MOD=0, BASE=4, to JUST get the index, scale, and displacement. + MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); + ForceDisp32 = true; + } else if (DispForReloc) { + // Emit the normal disp32 encoding. + MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); + ForceDisp32 = true; + } else if (DispVal == 0 && BaseRegNo != N86::EBP) { + // Emit no displacement ModR/M byte + MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); + } else if (isDisp8(DispVal)) { + // Emit the disp8 encoding... + MCE.emitByte(ModRMByte(1, RegOpcodeField, 4)); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else { + // Emit the normal disp32 encoding... + MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); + } + + // Calculate what the SS field value should be... + static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + unsigned SS = SSTable[Scale.getImm()]; + + if (BaseReg == 0) { + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = getX86RegNum(IndexReg.getReg()); + else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) + IndexRegNo = 4; + emitSIBByte(SS, IndexRegNo, 5); + } else { + unsigned BaseRegNo = getX86RegNum(BaseReg); + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = getX86RegNum(IndexReg.getReg()); + else + IndexRegNo = 4; // For example [ESP+1*<noreg>+4] + emitSIBByte(SS, IndexRegNo, BaseRegNo); + } + + // Do we need to output a displacement? + if (ForceDisp8) { + emitConstant(DispVal, 1); + } else if (DispVal != 0 || ForceDisp32) { + emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); + } +} + +static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II, + unsigned Opcode) { + const MCInstrDesc *Desc = &II->get(Opcode); + MI.setDesc(*Desc); + return Desc; +} + +/// Is16BitMemOperand - Return true if the specified instruction has +/// a 16-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. 
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK)
+ MCE.emitByte(0xF0);
+
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
+ MCE.emitByte(0xF3);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ if (TSFlags & X86II::AdSize) {
+ need_address_override = true;
+ } else if (MemOperand == -1) {
+ need_address_override = false;
+ } else if (Is64BitMode) {
+ assert(!Is16BitMemOperand(MI, MemOperand));
+ need_address_override = Is32BitMemOperand(MI, MemOperand);
+ } else {
+ assert(!Is64BitMemOperand(MI, MemOperand));
+ need_address_override = Is16BitMemOperand(MI, MemOperand);
+ }
+
+ if (need_address_override)
+ MCE.emitByte(0x67);
+
+ // Emit the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ MCE.emitByte(0x66);
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::A6: // 0F A6
+ case X86II::A7: // 0F A7
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::T8XS: // F3 0F 38
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ // Handle REX prefix.
+ if (Is64BitMode) {
+ if (unsigned REX = determineREX(MI))
+ MCE.emitByte(0x40 | REX);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ MCE.emitByte(0x0F);
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8XD: // F2 0F 38
+ case X86II::T8XS: // F3 0F 38
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ case X86II::A6: // 0F A6
+ MCE.emitByte(0xA6);
+ break;
+ case X86II::A7: // 0F A7
+ MCE.emitByte(0xA7);
+ break;
+ }
+}
+
+// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+// 0-7 and the difference between the 2 groups is given by the REX prefix.
+// In the VEX prefix, registers are seen sequentially from 0-15 and encoded
+// in 1's complement form, example:
+//
+// ModRM field => XMM9 => 1
+// VEX.VVVV => XMM9 => ~9
+//
+// See table 4-35 of Intel AVX Programming Reference for details.
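+// Concretely, XMM9 has the encoding value 9 = 0b1001, so the inverted 4-bit
+// field is (~9) & 0xf = 0b0110; getVEXRegisterEncoding below computes exactly
+// this for whichever operand register it is handed.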
+template<class CodeEmitter>
+unsigned char
+Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) const {
+ unsigned SrcReg = MI.getOperand(OpNum).getReg();
+ unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
+ if (X86II::isX86_64ExtendedReg(SrcReg))
+ SrcRegNum |= 8;
+
+ // The registers represented through VEX_VVVV should
+ // be encoded in 1's complement form.
+ return (~SrcRegNum) & 0xf;
+}
+
+/// emitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const {
+ switch (TSFlags & X86II::SegOvrMask) {
+ default: llvm_unreachable("Invalid segment!");
+ case 0:
+ // No segment override, check for explicit one on memory operand.
+ if (MemOperand != -1) { // If the instruction has a memory operand.
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: MCE.emitByte(0x2E); break;
+ case X86::SS: MCE.emitByte(0x36); break;
+ case X86::DS: MCE.emitByte(0x3E); break;
+ case X86::ES: MCE.emitByte(0x26); break;
+ case X86::FS: MCE.emitByte(0x64); break;
+ case X86::GS: MCE.emitByte(0x65); break;
+ }
+ }
+ break;
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+ // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+ // XOP: Use XOP prefix byte 0x8f instead of VEX.
+ unsigned char XOP = 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ unsigned char VEX_5M = 0x1;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // Encode the operand size opcode prefix as needed. + if (TSFlags & X86II::OpSize) + VEX_PP = 0x01; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) + VEX_W = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::XOP) + XOP = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) + VEX_L = 1; + + switch (TSFlags & X86II::Op0Mask) { + default: llvm_unreachable("Invalid prefix!"); + case X86II::T8: // 0F 38 + VEX_5M = 0x2; + break; + case X86II::TA: // 0F 3A + VEX_5M = 0x3; + break; + case X86II::T8XS: // F3 0F 38 + VEX_PP = 0x2; + VEX_5M = 0x2; + break; + case X86II::T8XD: // F2 0F 38 + VEX_PP = 0x3; + VEX_5M = 0x2; + break; + case X86II::TAXD: // F2 0F 3A + VEX_PP = 0x3; + VEX_5M = 0x3; + break; + case X86II::XS: // F3 0F + VEX_PP = 0x2; + break; + case X86II::XD: // F2 0F + VEX_PP = 0x3; + break; + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; + case X86II::A6: // Bypass: Not used by VEX + case X86II::A7: // Bypass: Not used by VEX + case X86II::TB: // Bypass: Not used by VEX + case 0: + break; // No prefix! + } + + + // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc->getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + // Duplicate register. 
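+ // (MRMInitReg forms encode one register in both the reg and r/m fields,
+ // which is why the same operand drives both VEX_R and VEX_B below.)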
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    if (HasVEX_4VOp3)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+    break;
+  case X86II::MRMDestMem: {
+    // MRMDestMem instruction forms:
+    //  MemAddr, src1(ModR/M)
+    //  MemAddr, src1(VEX_4V), src2(ModR/M)
+    //  MemAddr, src1(ModR/M), imm8
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+
+    CurOp = X86::AddrNumOperands;
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    const MachineOperand &MO = MI.getOperand(CurOp);
+    if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+      VEX_R = 0x0;
+    break;
+  }
+  case X86II::MRMSrcMem:
+    // MRMSrcMem instruction forms:
+    //  src1(ModR/M), MemAddr
+    //  src1(ModR/M), src2(VEX_4V), MemAddr
+    //  src1(ModR/M), MemAddr, imm8
+    //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M)
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      VEX_R = 0x0;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, 1);
+
+    if (X86II::isX86_64ExtendedReg(
+          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+
+    if (HasVEX_4VOp3)
+      VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+    break;
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    // MRM[0-9]m instruction forms:
+    //  MemAddr
+    //  src1(VEX_4V), MemAddr
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+    if (X86II::isX86_64ExtendedReg(
+          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    break;
+  }
+  case X86II::MRMSrcReg:
+    // MRMSrcReg instruction forms:
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M), src1(ModR/M)
+    //  dst(ModR/M), src1(ModR/M), imm8
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    CurOp++;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      CurOp++;
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    CurOp++;
+    if (HasVEX_4VOp3)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+    break;
+  case X86II::MRMDestReg:
+    // MRMDestReg instruction forms:
+    //  dst(ModR/M), src(ModR/M)
+    //  dst(ModR/M), src(ModR/M), imm8
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    CurOp++;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    break;
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    // MRM0r-MRM7r instruction forms:
+    //  dst(VEX_4V), src(ModR/M), imm8
+    VEX_4V = getVEXRegisterEncoding(MI, 0);
+    if
(X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_B = 0x0; + break; + default: // RawFrm + break; + } + + // Emit segment override opcode prefix as needed. + emitSegmentOverridePrefix(TSFlags, MemOperand, MI); + + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix + MCE.emitByte(0xC5); + MCE.emitByte(LastByte | (VEX_R << 7)); + return; + } + + // 3 byte VEX prefix + MCE.emitByte(XOP ? 0x8F : 0xC4); + MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M); + MCE.emitByte(LastByte | (VEX_W << 7)); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, + const MCInstrDesc *Desc) { + DEBUG(dbgs() << MI); + + // If this is a pseudo instruction, lower it. + switch (Desc->getOpcode()) { + case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; + case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; + case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; + case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; + case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; + case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; + case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; + case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; + case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; + case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; + case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; + case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; + case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; + case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; + case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; + case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; + case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; + } + + + MCE.processDebugLoc(MI.getDebugLoc(), true); + + unsigned Opcode = Desc->Opcode; + + // If this is a two-address instruction, skip one of the register operands. + unsigned NumOps = Desc->getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + + uint64_t TSFlags = Desc->TSFlags; + + // Is this instruction encoded using the AVX VEX prefix? + bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX; + // It uses the VEX.VVVV field? + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + const unsigned MemOp4_I8IMMOperand = 2; + + // Determine where the memory operand starts, if present. 
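+  // (getMemoryOperandNo does not account for the tied operands skipped
+  // above, so its result is offset by CurOp below before use.)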
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + if (!HasVEXPrefix) + emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); + else + emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); + + unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags); + switch (TSFlags & X86II::FormMask) { + default: + llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); + case X86II::Pseudo: + // Remember the current PC offset, this is the PIC relocation + // base address. + switch (Opcode) { + default: + llvm_unreachable("pseudo instructions should be removed before code" + " emission"); + // Do nothing for Int_MemBarrier - it's just a comment. Add a debug + // to make it slightly easier to see. + case X86::Int_MemBarrier: + DEBUG(dbgs() << "#MEMBARRIER\n"); + break; + + case TargetOpcode::INLINEASM: + // We allow inline assembler nodes with empty bodies - they can + // implicitly define registers, which is ok for JIT. + if (MI.getOperand(0).getSymbolName()[0]) + report_fatal_error("JIT does not support inline asm!"); + break; + case TargetOpcode::PROLOG_LABEL: + case TargetOpcode::GC_LABEL: + case TargetOpcode::EH_LABEL: + MCE.emitLabel(MI.getOperand(0).getMCSymbol()); + break; + + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + break; + case X86::MOVPC32r: { + // This emits the "call" portion of this pseudo instruction. + MCE.emitByte(BaseOpcode); + emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags)); + // Remember PIC base. + PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset(); + X86JITInfo *JTI = TM.getJITInfo(); + JTI->setPICBase(MCE.getCurrentPCValue()); + break; + } + } + CurOp = NumOps; + break; + case X86II::RawFrm: { + MCE.emitByte(BaseOpcode); + + if (CurOp == NumOps) + break; + + const MachineOperand &MO = MI.getOperand(CurOp++); + + DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n"); + DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n"); + DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n"); + DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n"); + DEBUG(dbgs() << "isImm " << MO.isImm() << "\n"); + + if (MO.isMBB()) { + emitPCRelativeBlockAddress(MO.getMBB()); + break; + } + + if (MO.isGlobal()) { + emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word, + MO.getOffset(), 0); + break; + } + + if (MO.isSymbol()) { + emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); + break; + } + + // FIXME: Only used by hackish MCCodeEmitter, remove when dead. + if (MO.isJTI()) { + emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word); + break; + } + + assert(MO.isImm() && "Unknown RawFrm operand!"); + if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { + // Fix up immediate operand for pc relative calls. + intptr_t Imm = (intptr_t)MO.getImm(); + Imm = Imm - MCE.getCurrentPCValue() - 4; + emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags)); + } else + emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags)); + break; + } + + case X86II::AddRegFrm: { + MCE.emitByte(BaseOpcode + + getX86RegNum(MI.getOperand(CurOp++).getReg())); + + if (CurOp == NumOps) + break; + + const MachineOperand &MO1 = MI.getOperand(CurOp++); + unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); + if (MO1.isImm()) { + emitConstant(MO1.getImm(), Size); + break; + } + + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (Opcode == X86::MOV64ri64i32) + rt = X86::reloc_absolute_word; // FIXME: add X86II flag? 
+ // This should not occur on Darwin for relocatable objects. + if (Opcode == X86::MOV64ri) + rt = X86::reloc_absolute_dword; // FIXME: add X86II flag? + if (MO1.isGlobal()) { + bool Indirect = gvNeedsNonLazyPtr(MO1, TM); + emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, + Indirect); + } else if (MO1.isSymbol()) + emitExternalSymbolAddress(MO1.getSymbolName(), rt); + else if (MO1.isCPI()) + emitConstPoolAddress(MO1.getIndex(), rt); + else if (MO1.isJTI()) + emitJumpTableAddress(MO1.getIndex(), rt); + break; + } + + case X86II::MRMDestReg: { + MCE.emitByte(BaseOpcode); + + unsigned SrcRegNum = CurOp+1; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; + + emitRegModRMByte(MI.getOperand(CurOp).getReg(), + getX86RegNum(MI.getOperand(SrcRegNum).getReg())); + CurOp = SrcRegNum + 1; + break; + } + case X86II::MRMDestMem: { + MCE.emitByte(BaseOpcode); + + unsigned SrcRegNum = CurOp + X86::AddrNumOperands; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; + emitMemModRMByte(MI, CurOp, + getX86RegNum(MI.getOperand(SrcRegNum).getReg())); + CurOp = SrcRegNum + 1; + break; + } + + case X86II::MRMSrcReg: { + MCE.emitByte(BaseOpcode); + + unsigned SrcRegNum = CurOp+1; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; + + emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(), + getX86RegNum(MI.getOperand(CurOp).getReg())); + // 2 operands skipped with HasMemOp4, compensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; + break; + } + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; + + MCE.emitByte(BaseOpcode); + + intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? + X86II::getSizeOfImm(Desc->TSFlags) : 0; + emitMemModRMByte(MI, FirstMemOp, + getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); + CurOp += AddrOperands + 1; + if (HasVEX_4VOp3) + ++CurOp; + break; + } + + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp++).getReg(), + (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r); + + if (CurOp == NumOps) + break; + + const MachineOperand &MO1 = MI.getOperand(CurOp++); + unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); + if (MO1.isImm()) { + emitConstant(MO1.getImm(), Size); + break; + } + + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (Opcode == X86::MOV64ri32) + rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? 
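+    // At this point rt names the relocation kind to record for whichever
+    // global / symbol / constant-pool / jump-table operand is emitted below.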
+ if (MO1.isGlobal()) { + bool Indirect = gvNeedsNonLazyPtr(MO1, TM); + emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, + Indirect); + } else if (MO1.isSymbol()) + emitExternalSymbolAddress(MO1.getSymbolName(), rt); + else if (MO1.isCPI()) + emitConstPoolAddress(MO1.getIndex(), rt); + else if (MO1.isJTI()) + emitJumpTableAddress(MO1.getIndex(), rt); + break; + } + + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? + (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? + X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; + + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m, + PCAdj); + CurOp += X86::AddrNumOperands; + + if (CurOp == NumOps) + break; + + const MachineOperand &MO = MI.getOperand(CurOp++); + unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); + if (MO.isImm()) { + emitConstant(MO.getImm(), Size); + break; + } + + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (Opcode == X86::MOV64mi32) + rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? + if (MO.isGlobal()) { + bool Indirect = gvNeedsNonLazyPtr(MO, TM); + emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0, + Indirect); + } else if (MO.isSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), rt); + else if (MO.isCPI()) + emitConstPoolAddress(MO.getIndex(), rt); + else if (MO.isJTI()) + emitJumpTableAddress(MO.getIndex(), rt); + break; + } + + case X86II::MRMInitReg: + MCE.emitByte(BaseOpcode); + // Duplicate register, used by things like MOV8r0 (aka xor reg,reg). + emitRegModRMByte(MI.getOperand(CurOp).getReg(), + getX86RegNum(MI.getOperand(CurOp).getReg())); + ++CurOp; + break; + + case X86II::MRM_C1: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xC1); + break; + case X86II::MRM_C8: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xC8); + break; + case X86II::MRM_C9: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xC9); + break; + case X86II::MRM_CA: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xCA); + break; + case X86II::MRM_CB: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xCB); + break; + case X86II::MRM_E8: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xE8); + break; + case X86II::MRM_F0: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xF0); + break; + } + + while (CurOp != NumOps && NumOps - CurOp <= 2) { + // The last source register of a 4 operand instruction in AVX is encoded + // in bits[7:4] of a immediate byte. + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { + const MachineOperand &MO = MI.getOperand(HasMemOp4 ? 
MemOp4_I8IMMOperand + : CurOp); + ++CurOp; + unsigned RegNum = getX86RegNum(MO.getReg()) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if (CurOp != NumOps) { + const MachineOperand &MIMM = MI.getOperand(CurOp++); + if (MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } + emitConstant(RegNum, 1); + } else { + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + } + } + + if (!MI.isVariadic() && CurOp != NumOps) { +#ifndef NDEBUG + dbgs() << "Cannot encode all operands of: " << MI << "\n"; +#endif + llvm_unreachable(0); + } + + MCE.processDebugLoc(MI.getDebugLoc(), false); +} diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp new file mode 100644 index 0000000..cf44bd0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -0,0 +1,2427 @@ +//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-specific support for the FastISel class. Much +// of the target-specific code is generated by tablegen in the file +// X86GenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86ISelLowering.h" +#include "X86InstrBuilder.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + +class X86FastISel : public FastISel { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// RegInfo - X86 register info. + /// + const X86RegisterInfo *RegInfo; + + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. 
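+  /// (When these are false, the x87 LD_Fp*/ST_Fp* opcodes are used instead;
+  /// see X86FastEmitLoad and X86FastEmitStore below.)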
+ bool X86ScalarSSEf64; + bool X86ScalarSSEf32; + +public: + explicit X86FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + } + + virtual bool TargetSelectInstruction(const Instruction *I); + + /// \brief The specified machine instr operand is a vreg, and that + /// vreg is being provided by the specified load instruction. If possible, + /// try to fold the load as an operand to the instruction, returning true if + /// possible. + virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI); + + virtual bool FastLowerArguments(); + +#include "X86GenFastISel.inc" + +private: + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); + + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); + + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); + + bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + + bool X86SelectAddress(const Value *V, X86AddressMode &AM); + bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); + + bool X86SelectLoad(const Instruction *I); + + bool X86SelectStore(const Instruction *I); + + bool X86SelectRet(const Instruction *I); + + bool X86SelectCmp(const Instruction *I); + + bool X86SelectZExt(const Instruction *I); + + bool X86SelectBranch(const Instruction *I); + + bool X86SelectShift(const Instruction *I); + + bool X86SelectDivRem(const Instruction *I); + + bool X86SelectSelect(const Instruction *I); + + bool X86SelectTrunc(const Instruction *I); + + bool X86SelectFPExt(const Instruction *I); + bool X86SelectFPTrunc(const Instruction *I); + + bool X86VisitIntrinsicCall(const IntrinsicInst &I); + bool X86SelectCall(const Instruction *I); + + bool DoSelectCall(const Instruction *I, const char *MemIntName); + + const X86InstrInfo *getInstrInfo() const { + return getTargetMachine()->getInstrInfo(); + } + const X86TargetMachine *getTargetMachine() const { + return static_cast<const X86TargetMachine *>(&TM); + } + + unsigned TargetMaterializeConstant(const Constant *C); + + unsigned TargetMaterializeAlloca(const AllocaInst *C); + + unsigned TargetMaterializeFloatZero(const ConstantFP *CF); + + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is + /// computed in an SSE register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); +}; + +} // end anonymous namespace. + +bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { + EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); + if (evt == MVT::Other || !evt.isSimple()) + // Unhandled type. Halt "fast" selection and bail. + return false; + + VT = evt.getSimpleVT(); + // For now, require SSE/SSE2 for performing floating-point operations, + // since x87 requires additional work. 
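+  // (Returning false here just hands the instruction back to the regular
+  // SelectionDAG path; it is an opt-out of the fast path, not an error.)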
+  if (VT == MVT::f64 && !X86ScalarSSEf64)
+    return false;
+  if (VT == MVT::f32 && !X86ScalarSSEf32)
+    return false;
+  // Similarly, no f80 support yet.
+  if (VT == MVT::f80)
+    return false;
+  // We only handle legal types. For example, on x86-32 the instruction
+  // selector contains all of the 64-bit instructions from x86-64,
+  // under the assumption that i64 won't be used if the target doesn't
+  // support it.
+  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address to load from is described by AM. On success, return true and
+/// pass back the result register by reference in ResultReg.
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
+                                  unsigned &ResultReg) {
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = &X86::GR8RegClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = &X86::GR16RegClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = &X86::GR32RegClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = &X86::GR64RegClass;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return false;
+  }
+
+  ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                         DL, TII.get(Opc), ResultReg), AM);
+  return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address to store to is described by AM. Return true if it
+/// is possible.
+bool
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
+  // Get opcode and regclass of the output for the given store instruction.
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f80: // No f80 support yet.
+  default: return false;
+  case MVT::i1: {
+    // Mask out all but lowest bit.
+    unsigned AndResult = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
+    Val = AndResult;
+  }
+  // FALLTHROUGH, handling i1 as i8.
+  case MVT::i8:  Opc = X86::MOV8mr;  break;
+  case MVT::i16: Opc = X86::MOV16mr; break;
+  case MVT::i32: Opc = X86::MOV32mr; break;
+  case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+  case MVT::f32:
+    Opc = X86ScalarSSEf32 ?
+          (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
+    break;
+  case MVT::f64:
+    Opc = X86ScalarSSEf64 ?
+          (Subtarget->hasAVX() ?
X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + break; + case MVT::v4f32: + Opc = X86::MOVAPSmr; + break; + case MVT::v2f64: + Opc = X86::MOVAPDmr; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + Opc = X86::MOVDQAmr; + break; + } + + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc)), AM).addReg(Val); + return true; +} + +bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, + const X86AddressMode &AM) { + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Val)) + Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); + + // If this is a store of a simple constant, fold the constant into the store. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { + unsigned Opc = 0; + bool Signed = true; + switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. + case MVT::i8: Opc = X86::MOV8mi; break; + case MVT::i16: Opc = X86::MOV16mi; break; + case MVT::i32: Opc = X86::MOV32mi; break; + case MVT::i64: + // Must be a 32-bit sign extended value. + if (isInt<32>(CI->getSExtValue())) + Opc = X86::MOV64mi32; + break; + } + + if (Opc) { + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc)), AM) + .addImm(Signed ? (uint64_t) CI->getSExtValue() : + CI->getZExtValue()); + return true; + } + } + + unsigned ValReg = getRegForValue(Val); + if (ValReg == 0) + return false; + + return X86FastEmitStore(VT, ValReg, AM); +} + +/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of +/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. +/// ISD::SIGN_EXTEND). +bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, + unsigned Src, EVT SrcVT, + unsigned &ResultReg) { + unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + if (RR == 0) + return false; + + ResultReg = RR; + return true; +} + +/// X86SelectAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { + const User *U = NULL; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(V->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::Alloca: { + // Do static allocas. 
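+    // Static allocas live in FuncInfo.StaticAllocaMap and become
+    // frame-index addresses; dynamic allocas miss the map and fall through
+    // to the generic register materialization at the bottom.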
+    const AllocaInst *A = cast<AllocaInst>(V);
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(A);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = SI->second;
+      return true;
+    }
+    break;
+  }
+
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+      // They have to fit in the 32-bit signed displacement field though.
+      if (isInt<32>(Disp)) {
+        AM.Disp = (uint32_t)Disp;
+        return X86SelectAddress(U->getOperand(0), AM);
+      }
+    }
+    break;
+  }
+
+  case Instruction::GetElementPtr: {
+    X86AddressMode SavedAM = AM;
+
+    // Pattern-match simple GEPs.
+    uint64_t Disp = (int32_t)AM.Disp;
+    unsigned IndexReg = AM.IndexReg;
+    unsigned Scale = AM.Scale;
+    gep_type_iterator GTI = gep_type_begin(U);
+    // Iterate through the indices, folding what we can. Constants can be
+    // folded, and one dynamic index can be handled, if the scale is supported.
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+         i != e; ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = TD.getStructLayout(STy);
+        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+        continue;
+      }
+
+      // An array/variable index is always of the form i*S where S is the
+      // constant scale size. See if we can push the scale into immediates.
+      uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+      for (;;) {
+        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+          // Constant-offset addressing.
+          Disp += CI->getSExtValue() * S;
+          break;
+        }
+        if (isa<AddOperator>(Op) &&
+            (!isa<Instruction>(Op) ||
+             FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
+               == FuncInfo.MBB) &&
+            isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
+          // An add (in the same block) with a constant operand. Fold the
+          // constant.
+          ConstantInt *CI =
+            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+          Disp += CI->getSExtValue() * S;
+          // Iterate on the other operand.
+          Op = cast<AddOperator>(Op)->getOperand(0);
+          continue;
+        }
+        if (IndexReg == 0 &&
+            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+            (S == 1 || S == 2 || S == 4 || S == 8)) {
+          // Scaled-index addressing.
+          Scale = S;
+          IndexReg = getRegForGEPIndex(Op).first;
+          if (IndexReg == 0)
+            return false;
+          break;
+        }
+        // Unsupported.
+        goto unsupported_gep;
+      }
+    }
+    // Check for displacement overflow.
+    if (!isInt<32>(Disp))
+      break;
+    // Ok, the GEP indices were covered by constant-offset and scaled-index
+    // addressing. Update the address state and move on to examining the base.
+    AM.IndexReg = IndexReg;
+    AM.Scale = Scale;
+    AM.Disp = (uint32_t)Disp;
+    if (X86SelectAddress(U->getOperand(0), AM))
+      return true;
+
+    // If we couldn't merge the gep value into this addr mode, revert to our
+    // address and just match the value instead of completely failing.
+    AM = SavedAM;
+    break;
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
+  }
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // Can't handle TLS yet.
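+    // (Thread-local globals need a TLS-model-specific access sequence that
+    // this path does not emit, so both direct and aliased TLS globals are
+    // rejected.)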
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->isThreadLocal()) + return false; + + // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how + // it works...). + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + if (const GlobalVariable *GVar = + dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false))) + if (GVar->isThreadLocal()) + return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } + + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; + } + + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; + } else { + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + } + + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } + + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + +/// X86SelectCallAddress - Attempt to fill in an address from the given value. 
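+/// Unlike X86SelectAddress, this rejects DLLImport and thread-local
+/// globals and does not try to fold GEPs or allocas; a call target only
+/// needs a simple base address.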
+/// +bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { + const User *U = NULL; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + Opcode = I->getOpcode(); + U = I; + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectCallAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + } + + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // RIP-relative addresses can't have additional register operands. + if (Subtarget->isPICStyleRIPRel() && + (AM.Base.Reg != 0 || AM.IndexReg != 0)) + return false; + + // Can't handle DLLImport. + if (GV->hasDLLImportLinkage()) + return false; + + // Can't handle TLS. + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->isThreadLocal()) + return false; + + // Okay, we've committed to selecting this global. Set up the basic address. + AM.GV = GV; + + // No ABI requires an extra load for anything other than DLLImport, which + // we rejected above. Return a direct reference to the global. + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } else if (Subtarget->isPICStyleStubPIC()) { + AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; + } else if (Subtarget->isPICStyleGOT()) { + AM.GVOpFlags = X86II::MO_GOTOFF; + } + + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + + +/// X86SelectStore - Select and emit code to implement store instructions. +bool X86FastISel::X86SelectStore(const Instruction *I) { + // Atomic stores need special handling. + const StoreInst *S = cast<StoreInst>(I); + + if (S->isAtomic()) + return false; + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) + return false; + + X86AddressMode AM; + if (!X86SelectAddress(I->getOperand(1), AM)) + return false; + + return X86FastEmitStore(VT, I->getOperand(0), AM); +} + +/// X86SelectRet - Select and emit code to implement ret instructions. 
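+/// Only the common cases are handled: the C, fast and x86 fastcall calling
+/// conventions, no varargs, no bytes popped on return, and at most one
+/// register return value (plus the implicit sret copy).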
+bool X86FastISel::X86SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + const X86MachineFunctionInfo *X86MFInfo = + FuncInfo.MF->getInfo<X86MachineFunctionInfo>(); + + if (!FuncInfo.CanLowerReturn) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::C && + CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall) + return false; + + if (Subtarget->isTargetWin64()) + return false; + + // Don't handle popping bytes on return for now. + if (X86MFInfo->getBytesToPopOnReturn() != 0) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Let SDISel handle vararg functions. + if (F.isVarArg()) + return false; + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, + I->getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + + // The calling-convention tables for x87 returns don't tell + // the whole story. + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + DstReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + // The x86-64 ABI for returning structs by value requires that we copy + // the sret argument into %rax for the return. We saved the argument into + // a virtual register in the entry block, so now we copy the value out + // and into %rax. We also do the same with %eax for Win32. 
+  if (F.hasStructRetAttr() &&
+      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+    unsigned Reg = X86MFInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments()!");
+    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            RetReg).addReg(Reg);
+    RetRegs.push_back(RetReg);
+  }
+
+  // Now emit the RET.
+  MachineInstrBuilder MIB =
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(I->getOperand(0), AM))
+    return false;
+
+  unsigned ResultReg = 0;
+  if (X86FastEmitLoad(VT, AM, ResultReg)) {
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+  return false;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+  bool HasAVX = Subtarget->hasAVX();
+  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:       return 0;
+  case MVT::i8:  return X86::CMP8rr;
+  case MVT::i16: return X86::CMP16rr;
+  case MVT::i32: return X86::CMP32rr;
+  case MVT::i64: return X86::CMP64rr;
+  case MVT::f32:
+    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+  case MVT::f64:
+    return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+  }
+}
+
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHSC as the RHS
+/// of the comparison, return an opcode that works for the compare (e.g.
+/// CMP32ri) otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Otherwise, we can't fold the immediate into this comparison.
+  default: return 0;
+  case MVT::i8: return X86::CMP8ri;
+  case MVT::i16: return X86::CMP16ri;
+  case MVT::i32: return X86::CMP32ri;
+  case MVT::i64:
+    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+    // field.
+    if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+      return X86::CMP64ri32;
+    return 0;
+  }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
+                                     EVT VT) {
+  unsigned Op0Reg = getRegForValue(Op0);
+  if (Op0Reg == 0) return false;
+
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Op1))
+    Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
+
+  // We have two options: compare with register or immediate. If the RHS of
+  // the compare is an immediate that we can fold into this compare, use
+  // CMPri, otherwise use CMPrr.
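+  // For example, "icmp eq i32 %x, 42" folds to CMP32ri %x, 42, while a
+  // compare of two virtual registers becomes CMP32rr.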
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { + if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareImmOpc)) + .addReg(Op0Reg) + .addImm(Op1C->getSExtValue()); + return true; + } + } + + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); + if (CompareOpc == 0) return false; + + unsigned Op1Reg = getRegForValue(Op1); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc)) + .addReg(Op0Reg) + .addReg(Op1Reg); + + return true; +} + +bool X86FastISel::X86SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + unsigned ResultReg = createResultReg(&X86::GR8RegClass); + unsigned SetCCOpc; + bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. + switch (CI->getPredicate()) { + case CmpInst::FCMP_OEQ: { + if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + return false; + + unsigned EReg = createResultReg(&X86::GR8RegClass); + unsigned NPReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::SETNPr), NPReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); + UpdateValueMap(I, ResultReg); + return true; + } + case CmpInst::FCMP_UNE: { + if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + return false; + + unsigned NEReg = createResultReg(&X86::GR8RegClass); + unsigned PReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg) + .addReg(PReg).addReg(NEReg); + UpdateValueMap(I, ResultReg); + return true; + } + case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; + case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; + case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break; + case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break; + case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; + case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break; + case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break; + case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; + case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break; + case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; + case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; + case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; + + case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; + case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; + case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; + case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; + case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; + case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; + case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break; + case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break; + case CmpInst::ICMP_SLT: SwapArgs = 
false; SetCCOpc = X86::SETLr; break; + case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break; + default: + return false; + } + + const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + if (SwapArgs) + std::swap(Op0, Op1); + + // Emit a compare of Op0/Op1. + if (!X86FastEmitCompare(Op0, Op1, VT)) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectZExt(const Instruction *I) { + // Handle zero-extension from i1 to i8, which is common. + if (!I->getOperand(0)->getType()->isIntegerTy(1)) + return false; + + EVT DstVT = TLI.getValueType(I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + if (ResultReg == 0) + return false; + + if (DstVT != MVT::i8) { + ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; + } + + UpdateValueMap(I, ResultReg); + return true; +} + + +bool X86FastISel::X86SelectBranch(const Instruction *I) { + // Unconditional branches are selected by tablegen-generated code. + // Handle a conditional branch. + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Fold the common case of a conditional branch with a comparison + // in the same block (values defined on other blocks may not have + // initialized registers). + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && CI->getParent() == I->getParent()) { + EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. + unsigned BranchOpc; // Opcode to jump on, e.g. 
"X86::JA" + + switch (Predicate) { + case CmpInst::FCMP_OEQ: + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::FCMP_UNE; + // FALL THROUGH + case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break; + case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break; + case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; + case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break; + case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break; + case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break; + case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break; + case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break; + case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break; + case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break; + case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break; + case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; + case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; + + case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break; + case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break; + case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break; + case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; + case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; + case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; + case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break; + case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break; + case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break; + case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break; + default: + return false; + } + + const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + if (SwapArgs) + std::swap(Op0, Op1); + + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(Op0, Op1, VT)) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc)) + .addMBB(TrueMBB); + + if (Predicate == CmpInst::FCMP_UNE) { + // X86 requires a second branch to handle UNE (and OEQ, + // which is mapped to UNE above). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JP_4)) + .addMBB(TrueMBB); + } + + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. 
+ MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_4; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_4; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; + } + } + } + + // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. + unsigned OpReg = getRegForValue(BI->getCondition()); + if (OpReg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; +} + +bool X86FastISel::X86SelectShift(const Instruction *I) { + unsigned CReg = 0, OpReg = 0; + const TargetRegisterClass *RC = NULL; + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { + CReg = X86::CX; + RC = &X86::GR16RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(32)) { + CReg = X86::ECX; + RC = &X86::GR32RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(64)) { + CReg = X86::RCX; + RC = &X86::GR64RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; + default: return false; + } + } else { + return false; + } + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CReg).addReg(Op1Reg); + + // The shift instruction uses X86::CL. If we defined a super-register + // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. 
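+  // (For a 32-bit shift, for example, the amount was copied into ECX above,
+  // and the KILL makes it explicit that the SHR/SAR/SHL32rCL instruction
+  // only reads CL.)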
+ if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpReg), ResultReg) + .addReg(Op0Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectDivRem(const Instruction *I) { + const static unsigned NumTypes = 4; // i8, i16, i32, i64 + const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem + const static bool S = true; // IsSigned + const static bool U = false; // !IsSigned + const static unsigned Copy = TargetOpcode::COPY; + // For the X86 DIV/IDIV instruction, in most cases the dividend + // (numerator) must be in a specific register pair highreg:lowreg, + // producing the quotient in lowreg and the remainder in highreg. + // For most data types, to set up the instruction, the dividend is + // copied into lowreg, and lowreg is sign-extended or zero-extended + // into highreg. The exception is i8, where the dividend is defined + // as a single register rather than a register pair, and we + // therefore directly sign-extend or zero-extend the dividend into + // lowreg, instead of copying, and ignore the highreg. + const static struct DivRemEntry { + // The following portion depends only on the data type. + const TargetRegisterClass *RC; + unsigned LowInReg; // low part of the register pair + unsigned HighInReg; // high part of the register pair + // The following portion depends on both the data type and the operation. + struct DivRemResult { + unsigned OpDivRem; // The specific DIV/IDIV opcode to use. + unsigned OpSignExtend; // Opcode for sign-extending lowreg into + // highreg, or copying a zero into highreg. + unsigned OpCopy; // Opcode for copying dividend into lowreg, or + // zero/sign-extending into lowreg for i8. + unsigned DivRemResultReg; // Register containing the desired result. + bool IsOpSigned; // Whether to use signed or unsigned form. 
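+      // Reading one row as an example: the i32 SDiv entry below is
+      // { IDIV32r, CDQ, COPY, EAX, S }, i.e. COPY the dividend into EAX,
+      // CDQ sign-extends EAX into EDX, and IDIV32r leaves the quotient
+      // in EAX (the remainder would be read from EDX for SRem).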
+ } ResultTable[NumOps]; + } OpTable[NumTypes] = { + { &X86::GR8RegClass, X86::AX, 0, { + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem + } + }, // i8 + { &X86::GR16RegClass, X86::AX, X86::DX, { + { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv + { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem + { X86::DIV16r, X86::MOV16r0, Copy, X86::AX, U }, // UDiv + { X86::DIV16r, X86::MOV16r0, Copy, X86::DX, U }, // URem + } + }, // i16 + { &X86::GR32RegClass, X86::EAX, X86::EDX, { + { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv + { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem + { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv + { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem + } + }, // i32 + { &X86::GR64RegClass, X86::RAX, X86::RDX, { + { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv + { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem + { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem + } + }, // i64 + }; + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned TypeIndex, OpIndex; + switch (VT.SimpleTy) { + default: return false; + case MVT::i8: TypeIndex = 0; break; + case MVT::i16: TypeIndex = 1; break; + case MVT::i32: TypeIndex = 2; break; + case MVT::i64: TypeIndex = 3; + if (!Subtarget->is64Bit()) + return false; + break; + } + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected div/rem opcode"); + case Instruction::SDiv: OpIndex = 0; break; + case Instruction::SRem: OpIndex = 1; break; + case Instruction::UDiv: OpIndex = 2; break; + case Instruction::URem: OpIndex = 3; break; + } + + const DivRemEntry &TypeEntry = OpTable[TypeIndex]; + const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) + return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) + return false; + + // Move op0 into low-order input register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); + // Zero-extend or sign-extend into high-order input register. + if (OpEntry.OpSignExtend) { + if (OpEntry.IsOpSigned) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpSignExtend)); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpSignExtend), TypeEntry.HighInReg); + } + // Generate the DIV/IDIV instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); + // Copy output register into result register. + unsigned ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), ResultReg).addReg(OpEntry.DivRemResultReg); + UpdateValueMap(I, ResultReg); + + return true; +} + +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + // We only use cmov here, if we don't have a cmov instruction bail. 
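+  // Sketch of what this produces (registers illustrative, not verbatim):
+  //   testb %al, %al      # TEST8rr: ZF <- (cond == 0)
+  //   cmovel %esi, %edi   # CMOVE*rr: keep the true value, or replace it
+  //                       # with the false value when ZF is set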
+ if (!Subtarget->hasCMov()) return false; + + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + if (VT == MVT::i16) { + Opc = X86::CMOVE16rr; + RC = &X86::GR16RegClass; + } else if (VT == MVT::i32) { + Opc = X86::CMOVE32rr; + RC = &X86::GR32RegClass; + } else if (VT == MVT::i64) { + Opc = X86::CMOVE64rr; + RC = &X86::GR64RegClass; + } else { + return false; + } + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + unsigned Op2Reg = getRegForValue(I->getOperand(2)); + if (Op2Reg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) + .addReg(Op0Reg).addReg(Op0Reg); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) + .addReg(Op1Reg).addReg(Op2Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectFPExt(const Instruction *I) { + // fpext from float to double. + if (X86ScalarSSEf64 && + I->getType()->isDoubleTy()) { + const Value *V = I->getOperand(0); + if (V->getType()->isFloatTy()) { + unsigned OpReg = getRegForValue(V); + if (OpReg == 0) return false; + unsigned ResultReg = createResultReg(&X86::FR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::CVTSS2SDrr), ResultReg) + .addReg(OpReg); + UpdateValueMap(I, ResultReg); + return true; + } + } + + return false; +} + +bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { + if (X86ScalarSSEf64) { + if (I->getType()->isFloatTy()) { + const Value *V = I->getOperand(0); + if (V->getType()->isDoubleTy()) { + unsigned OpReg = getRegForValue(V); + if (OpReg == 0) return false; + unsigned ResultReg = createResultReg(&X86::FR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::CVTSD2SSrr), ResultReg) + .addReg(OpReg); + UpdateValueMap(I, ResultReg); + return true; + } + } + } + + return false; +} + +bool X86FastISel::X86SelectTrunc(const Instruction *I) { + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(I->getType()); + + // This code only handles truncation to byte. + if (DstVT != MVT::i8 && DstVT != MVT::i1) + return false; + if (!TLI.isTypeLegal(SrcVT)) + return false; + + unsigned InputReg = getRegForValue(I->getOperand(0)); + if (!InputReg) + // Unhandled operand. Halt "fast" selection and bail. + return false; + + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + UpdateValueMap(I, InputReg); + return true; + } + + if (!Subtarget->is64Bit()) { + // If we're on x86-32; we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? + (const TargetRegisterClass*)&X86::GR16_ABCDRegClass : + (const TargetRegisterClass*)&X86::GR32_ABCDRegClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(InputReg); + InputReg = CopyReg; + } + + // Issue an extract_subreg. + unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, + InputReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) + return false; + + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 
32 : 16); +} + +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, Reg); + RV &= X86FastEmitStore(VT, Reg, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + +bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { + // FIXME: Handle more intrinsics. + switch (I.getIntrinsicID()) { + default: return false; + case Intrinsic::memcpy: { + const MemCpyInst &MCI = cast<MemCpyInst>(I); + // Don't handle volatile or variable length memcpys. + if (MCI.isVolatile()) + return false; + + if (isa<ConstantInt>(MCI.getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memcpy"); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast<MemSetInst>(I); + + if (MSI.isVolatile()) + return false; + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memset"); + } + case Intrinsic::stackprotector: { + // Emit code to store the stack guard onto the stack. + EVT PtrTy = TLI.getPointerTy(); + + const Value *Op1 = I.getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1)); + + // Grab the frame index. + X86AddressMode AM; + if (!X86SelectAddress(Slot, AM)) return false; + if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; + return true; + } + case Intrinsic::dbg_declare: { + const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I); + X86AddressMode AM; + assert(DI->getAddress() && "Null address should be checked earlier!"); + if (!X86SelectAddress(DI->getAddress(), AM)) + return false; + const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); + // FIXME may need to add RegState::Debug to any registers produced, + // although ESP/EBP should be the only ones at the moment. + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II), AM). + addImm(0).addMetadata(DI->getVariable()); + return true; + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TRAP)); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: { + // FIXME: Should fold immediates. + + // Replace "add with overflow" intrinsics with an "add" instruction followed + // by a seto/setc instruction. 
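+    // Worked example: for uadd.with.overflow.i32, 0xFFFFFFFF + 1 wraps to
+    // 0 and sets the carry flag, which SETB writes into the second result
+    // register; for the signed form, 0x7FFFFFFF + 1 wraps to INT_MIN and
+    // sets the overflow flag, which SETO captures instead.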
+    const Function *Callee = I.getCalledFunction();
+    Type *RetTy =
+      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    const Value *Op1 = I.getArgOperand(0);
+    const Value *Op2 = I.getArgOperand(1);
+    unsigned Reg1 = getRegForValue(Op1);
+    unsigned Reg2 = getRegForValue(Op2);
+
+    if (Reg1 == 0 || Reg2 == 0)
+      // FIXME: Handle values *not* in registers.
+      return false;
+
+    unsigned OpC = 0;
+    if (VT == MVT::i32)
+      OpC = X86::ADD32rr;
+    else if (VT == MVT::i64)
+      OpC = X86::ADD64rr;
+    else
+      return false;
+
+    // The call to CreateRegs builds two sequential registers, to store
+    // both of the returned values.
+    unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
+      .addReg(Reg1).addReg(Reg2);
+
+    unsigned Opc = X86::SETBr;
+    if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
+      Opc = X86::SETOr;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1);
+
+    UpdateValueMap(&I, ResultReg, 2);
+    return true;
+  }
+  }
+}
+
+bool X86FastISel::FastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (Subtarget->isTargetWin64())
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C)
+    return false;
+
+  if (!Subtarget->is64Bit())
+    return false;
+
+  // Only handle simple cases, i.e. up to 6 i32/i64 scalar arguments.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (Idx > 6)
+      return false;
+
+    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = I->getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    case MVT::i32:
+    case MVT::i64:
+      break;
+    default:
+      return false;
+    }
+  }
+
+  static const uint16_t GPR32ArgRegs[] = {
+    X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+  };
+  static const uint16_t GPR64ArgRegs[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+  };
+
+  Idx = 0;
+  const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32);
+  const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (I->use_empty())
+      continue;
+    bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
+    const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
+    unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein
+    // copy. Without this, EmitLiveInCopies may eliminate the livein if its
+    // only use is a bitcast (which isn't turned into an instruction).
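+    // Illustrative result (vreg numbers hypothetical) for a first i64
+    // argument:
+    //   %vreg0<def> = COPY %RDI     ; livein copy inserted by EmitLiveInCopies
+    //   %vreg1<def> = COPY %vreg0   ; the extra copy emitted below
+    // so the argument value always has a use that survives as an instruction.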
+ unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(DstReg, getKillRegState(true)); + UpdateValueMap(I, ResultReg); + } + return true; +} + +bool X86FastISel::X86SelectCall(const Instruction *I) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Can't handle inline asm yet. + if (isa<InlineAsm>(Callee)) + return false; + + // Handle intrinsic calls. + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) + return X86VisitIntrinsicCall(*II); + + // Allow SelectionDAG isel to handle tail calls. + if (cast<CallInst>(I)->isTailCall()) + return false; + + return DoSelectCall(I, 0); +} + +static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget, + const ImmutableCallSite &CS) { + if (Subtarget.is64Bit()) + return 0; + if (Subtarget.isTargetWindows()) + return 0; + CallingConv::ID CC = CS.getCallingConv(); + if (CC == CallingConv::Fast || CC == CallingConv::GHC) + return 0; + if (!CS.paramHasAttr(1, Attribute::StructRet)) + return 0; + if (CS.paramHasAttr(1, Attribute::InReg)) + return 0; + return 4; +} + +// Select either a call, or an llvm.memcpy/memmove/memset intrinsic +bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Handle only C and fastcc calling conventions for now. + ImmutableCallSite CS(CI); + CallingConv::ID CC = CS.getCallingConv(); + if (CC != CallingConv::C && CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + bool isVarArg = FTy->isVarArg(); + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented. + if (isVarArg && Subtarget->isTargetWin64()) + return false; + + // Fast-isel doesn't know about callee-pop yet. + if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, + TM.Options.GuaranteedTailCallOpt)) + return false; + + // Check whether the function can return without sret-demotion. + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI); + bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); + if (!CanLowerReturn) + return false; + + // Materialize callee address in a register. FIXME: GV address can be + // handled with a CALLpcrel32 instead. + X86AddressMode CalleeAM; + if (!X86SelectCallAddress(Callee, CalleeAM)) + return false; + unsigned CalleeOp = 0; + const GlobalValue *GV = 0; + if (CalleeAM.GV != 0) { + GV = CalleeAM.GV; + } else if (CalleeAM.Base.Reg != 0) { + CalleeOp = CalleeAM.Base.Reg; + } else + return false; + + // Deal with call operands first. 
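+  // For example, an i8 constant argument such as "i8 42" is rewritten in
+  // the loop below to the i32 constant 42 (sign- or zero-extended per the
+  // parameter's sext/zext attribute), saving an extend instruction.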
+  SmallVector<const Value *, 8> ArgVals;
+  SmallVector<unsigned, 8> Args;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  unsigned arg_size = CS.arg_size();
+  Args.reserve(arg_size);
+  ArgVals.reserve(arg_size);
+  ArgVTs.reserve(arg_size);
+  ArgFlags.reserve(arg_size);
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a mem intrinsic instead of a regular call, skip the
+    // last two arguments, which should not be passed to the underlying
+    // functions.
+    if (MemIntName && e-i <= 2)
+      break;
+    Value *ArgVal = *i;
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+      PointerType *Ty = cast<PointerType>(ArgVal->getType());
+      Type *ElementTy = Ty->getElementType();
+      unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
+      unsigned FrameAlign = CS.getParamAlignment(AttrInd);
+      if (!FrameAlign)
+        FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+      Flags.setByVal();
+      Flags.setByValSize(FrameSize);
+      Flags.setByValAlign(FrameAlign);
+      if (!IsMemcpySmall(FrameSize))
+        return false;
+    }
+
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+      Flags.setInReg();
+    if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+      Flags.setNest();
+
+    // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
+    // instruction. This is safe because it is common to all fastisel
+    // supported calling conventions on x86.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) {
+      if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 ||
+          CI->getBitWidth() == 16) {
+        if (Flags.isSExt())
+          ArgVal = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
+        else
+          ArgVal = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
+      }
+    }
+
+    unsigned ArgReg;
+
+    // Passing bools around ends up doing a trunc to i1 and passing it.
+    // Codegen this as an argument + "and 1".
+    if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) &&
+        cast<TruncInst>(ArgVal)->getParent() == I->getParent() &&
+        ArgVal->hasOneUse()) {
+      ArgVal = cast<TruncInst>(ArgVal)->getOperand(0);
+      ArgReg = getRegForValue(ArgVal);
+      if (ArgReg == 0) return false;
+
+      MVT ArgVT;
+      if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false;
+
+      ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg,
+                           ArgVal->hasOneUse(), 1);
+    } else {
+      ArgReg = getRegForValue(ArgVal);
+    }
+
+    if (ArgReg == 0) return false;
+
+    Type *ArgTy = ArgVal->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT))
+      return false;
+    if (ArgVT == MVT::x86mmx)
+      return false;
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(ArgReg);
+    ArgVals.push_back(ArgVal);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
+                 I->getParent()->getContext());
+
+  // Allocate shadow area for Win64.
+  if (Subtarget->isTargetWin64())
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
+
+  // Get a count of how many bytes are to be pushed on the stack.
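+  // E.g. a 32-bit cdecl call f(int, int, int) yields NumBytes == 12; the
+  // value is threaded through CALLSEQ_START/CALLSEQ_END below so frame
+  // lowering can adjust the stack pointer around the call.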
+ unsigned NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Process argument: walk the register/memloc assignments, inserting + // copies / loads. + SmallVector<unsigned, 4> RegArgs; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = Args[VA.getValNo()]; + EVT ArgVT = ArgVTs[VA.getValNo()]; + + // Promote the value if needed. + switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(), + ISD::BITCAST, Arg, /*TODO: Kill=*/false); + assert(BC != 0 && "Failed to emit a bitcast!"); + Arg = BC; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::VExt: + // VExt has not been implemented, so this should be impossible to reach + // for now. However, fallback to Selection DAG isel once implemented. + return false; + case CCValAssign::Indirect: + // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully + // support this. + return false; + } + + if (VA.isRegLoc()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + VA.getLocReg()).addReg(Arg); + RegArgs.push_back(VA.getLocReg()); + } else { + unsigned LocMemOffset = VA.getLocMemOffset(); + X86AddressMode AM; + AM.Base.Reg = RegInfo->getStackRegister(); + AM.Disp = LocMemOffset; + const Value *ArgVal = ArgVals[VA.getValNo()]; + ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; + + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = Arg; + bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()); + assert(Res && "memcpy length already checked!"); (void)Res; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument. + if (!X86FastEmitStore(ArgVT, ArgVal, AM)) + return false; + } else { + if (!X86FastEmitStore(ArgVT, Arg, AM)) + return false; + } + } + } + + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. 
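+  // Sketch (AT&T syntax, illustrative) of what this sets up on x86-32
+  // ELF/PIC:
+  //   movl %reg_gotbase, %ebx   # EBX <- GOT pointer (global base reg)
+  //   calll foo@PLT             # the PLT stub expects the GOT in %ebx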
+  if (Subtarget->isPICStyleGOT()) {
+    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            X86::EBX).addReg(Base);
+  }
+
+  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+    // Count the number of XMM registers allocated.
+    static const uint16_t XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri),
+            X86::AL).addImm(NumXMMRegs);
+  }
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  if (CalleeOp) {
+    // Register-indirect call.
+    unsigned CallOpc;
+    if (Subtarget->is64Bit())
+      CallOpc = X86::CALL64r;
+    else
+      CallOpc = X86::CALL32r;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))
+      .addReg(CalleeOp);
+
+  } else {
+    // Direct call.
+    assert(GV && "Not a direct call");
+    unsigned CallOpc;
+    if (Subtarget->is64Bit())
+      CallOpc = X86::CALL64pcrel32;
+    else
+      CallOpc = X86::CALLpcrel32;
+
+    // See if we need any target-specific flags on the GV operand.
+    unsigned char OpFlags = 0;
+
+    // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+    // external symbols must go through the PLT in PIC mode. If the symbol
+    // has hidden or protected visibility, or if it is static or local, then
+    // we don't need to use the PLT - we can directly call it.
+    if (Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_ &&
+        GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+      OpFlags = X86II::MO_PLT;
+    } else if (Subtarget->isPICStyleStubAny() &&
+               (GV->isDeclaration() || GV->isWeakForLinker()) &&
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the Leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = X86II::MO_DARWIN_STUB;
+    }
+
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc));
+    if (MemIntName)
+      MIB.addExternalSymbol(MemIntName, OpFlags);
+    else
+      MIB.addGlobalAddress(GV, 0, OpFlags);
+  }
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+  // Add an implicit use of the GOT pointer in EBX.
+  if (Subtarget->isPICStyleGOT())
+    MIB.addReg(X86::EBX, RegState::Implicit);
+
+  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+    MIB.addReg(X86::AL, RegState::Implicit);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Issue CALLSEQ_END.
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
+    .addImm(NumBytes).addImm(NumBytesCallee);
+
+  // Build info for return calling conv lowering code.
+  // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo.
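+  // For instance, a call returning {i64, i64} produces two InputArg
+  // entries here (one per register); AnalyzeCallResult below then assigns
+  // them to the ABI's return registers (RAX and RDX on x86-64).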
+ SmallVector<ISD::InputArg, 32> Ins; + SmallVector<EVT, 4> RetTys; + ComputeValueVTs(TLI, I->getType(), RetTys); + for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { + EVT VT = RetTys[i]; + MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); + for (unsigned j = 0; j != NumRegs; ++j) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT; + MyFlags.Used = !CS.getInstruction()->use_empty(); + if (CS.paramHasAttr(0, Attribute::SExt)) + MyFlags.Flags.setSExt(); + if (CS.paramHasAttr(0, Attribute::ZExt)) + MyFlags.Flags.setZExt(); + if (CS.paramHasAttr(0, Attribute::InReg)) + MyFlags.Flags.setInReg(); + Ins.push_back(MyFlags); + } + } + + // Now handle call return values. + SmallVector<unsigned, 4> UsedRegs; + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, + I->getParent()->getContext()); + unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + EVT CopyVT = RVLocs[i].getValVT(); + unsigned CopyReg = ResultReg + i; + + // If this is a call to a function that returns an fp value on the x87 fp + // stack, but where we prefer to use the value in xmm registers, copy it + // out as F80 and use a truncate to move it from fp stack reg to xmm reg. + if ((RVLocs[i].getLocReg() == X86::ST0 || + RVLocs[i].getLocReg() == X86::ST1)) { + if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(&X86::RFP80RegClass); + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL), + CopyReg); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(RVLocs[i].getLocReg()); + UsedRegs.push_back(RVLocs[i].getLocReg()); + } + + if (CopyVT != RVLocs[i].getValVT()) { + // Round the F80 the right size, which also moves to the appropriate xmm + // register. This is accomplished by storing the F80 value in memory and + // then loading it back. Ewww... + EVT ResVT = RVLocs[i].getValVT(); + unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg + i), FI); + } + } + + if (RVLocs.size()) + UpdateValueMap(I, ResultReg, RVLocs.size()); + + // Set all unused physreg defs as dead. 
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + + +bool +X86FastISel::TargetSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: break; + case Instruction::Load: + return X86SelectLoad(I); + case Instruction::Store: + return X86SelectStore(I); + case Instruction::Ret: + return X86SelectRet(I); + case Instruction::ICmp: + case Instruction::FCmp: + return X86SelectCmp(I); + case Instruction::ZExt: + return X86SelectZExt(I); + case Instruction::Br: + return X86SelectBranch(I); + case Instruction::Call: + return X86SelectCall(I); + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + return X86SelectShift(I); + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return X86SelectDivRem(I); + case Instruction::Select: + return X86SelectSelect(I); + case Instruction::Trunc: + return X86SelectTrunc(I); + case Instruction::FPExt: + return X86SelectFPExt(I); + case Instruction::FPTrunc: + return X86SelectFPTrunc(I); + case Instruction::IntToPtr: // Deliberate fall-through. + case Instruction::PtrToInt: { + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(I->getType()); + if (DstVT.bitsGT(SrcVT)) + return X86SelectZExt(I); + if (DstVT.bitsLT(SrcVT)) + return X86SelectTrunc(I); + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) return false; + UpdateValueMap(I, Reg); + return true; + } + } + + return false; +} + +unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { + MVT VT; + if (!isTypeLegal(C->getType(), VT)) + return 0; + + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return 0; + case MVT::i8: + Opc = X86::MOV8rm; + RC = &X86::GR8RegClass; + break; + case MVT::i16: + Opc = X86::MOV16rm; + RC = &X86::GR16RegClass; + break; + case MVT::i32: + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return 0; + } + + // Materialize addresses with LEA instructions. + if (isa<GlobalValue>(C)) { + X86AddressMode AM; + if (X86SelectAddress(C, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + return AM.Base.Reg; + + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + unsigned ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), AM); + return ResultReg; + } + return 0; + } + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(C->getType()); + if (Align == 0) { + // Alignment of vector types. FIXME! 
+    Align = TD.getTypeAllocSize(C->getType());
+  }
+
+  // x86-32 PIC requires a PIC base register for constant pools.
+  unsigned PICBase = 0;
+  unsigned char OpFlag = 0;
+  if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  } else if (Subtarget->isPICStyleRIPRel() &&
+             TM.getCodeModel() == CodeModel::Small) {
+    PICBase = X86::RIP;
+  }
+
+  // Create the load from the constant pool.
+  unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+  unsigned ResultReg = createResultReg(RC);
+  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                   TII.get(Opc), ResultReg),
+                           MCPOffset, PICBase, OpFlag);
+
+  return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
+  // Fail on dynamic allocas. At this point, getRegForValue has already
+  // checked its CSE maps, so if we're here trying to handle a dynamic
+  // alloca, we're not going to succeed. X86SelectAddress has a
+  // check for dynamic allocas, because it's called directly from
+  // various places, but TargetMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddress, and TargetMaterializeAlloca.
+  if (!FuncInfo.StaticAllocaMap.count(C))
+    return 0;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(C, AM))
+    return 0;
+  unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+  unsigned ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                         TII.get(Opc), ResultReg), AM);
+  return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
+  MVT VT;
+  if (!isTypeLegal(CF->getType(), VT))
+    return 0;
+
+  // Get opcode and regclass for the given zero.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = X86::FsFLD0SS;
+      RC = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp032;
+      RC = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = X86::FsFLD0SD;
+      RC = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp064;
+      RC = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+  return ResultReg;
+}
+
+
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  X86AddressMode AM;
+  if (!X86SelectAddress(LI->getOperand(0), AM))
+    return false;
+
+  const X86InstrInfo &XII = (const X86InstrInfo&)TII;
+
+  unsigned Size = TD.getTypeAllocSize(LI->getType());
+  unsigned Alignment = LI->getAlignment();
+
+  SmallVector<MachineOperand, 8> AddrOps;
+  AM.getFullAddress(AddrOps);
+
+  MachineInstr *Result =
+    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+  if (Result == 0) return false;
+
+  FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
+  MI->eraseFromParent();
+  return true;
+}
+
+
+namespace llvm {
+  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    return new X86FastISel(funcInfo, libInfo);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 0000000..0dd034c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,253 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will find instructions which
+// can be re-written as LEA instructions in order to reduce pipeline
+// delays for some models of the Intel Atom family.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+  class FixupLEAPass : public MachineFunctionPass {
+    enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+    static char ID;
+    /// \brief Loop over all of the instructions in the basic block,
+    /// replacing applicable instructions with LEA instructions,
+    /// where appropriate.
+    bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+    virtual const char *getPassName() const { return "X86 Atom LEA Fixup"; }
+
+    /// \brief Given a machine register, look for the instruction
+    /// which writes it in the current basic block. If found,
+    /// try to replace it with an equivalent LEA instruction.
+    /// If replacement succeeds, then also process the newly created
+    /// instruction.
+    void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I,
+                      MachineFunction::iterator MFI);
+
+    /// \brief Given a memory access or LEA instruction
+    /// whose address mode uses a base and/or index register, look for
+    /// an opportunity to replace the instruction which sets the base or index
+    /// register with an equivalent LEA instruction.
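+    /// For example (registers illustrative), in a sequence such as
+    ///   movl %ecx, %eax
+    ///   movl (%eax), %edx
+    /// the MOV32rr defining the base register may be rewritten as
+    ///   leal (%ecx), %eax
+    /// which on the affected Atom cores avoids a pipeline delay between
+    /// the register write and its use in address generation.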
+ void processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I); + + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + + /// \brief if an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. + MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; + + public: + FixupLEAPass() : MachineFunctionPass(ID) {} + + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible. + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + MachineFunction *MF; + const TargetMachine *TM; + const TargetInstrInfo *TII; // Machine instruction info. + + }; + char FixupLEAPass::ID = 0; +} + +MachineInstr * +FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const { + MachineInstr* MI = MBBI; + MachineInstr* NewMI; + switch (MI->getOpcode()) { + case X86::MOV32rr: + case X86::MOV64rr: { + const MachineOperand& Src = MI->getOperand(1); + const MachineOperand& Dest = MI->getOperand(0); + NewMI = BuildMI(*MF, MI->getDebugLoc(), + TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (!MI->getOperand(2).isImm()) { + // convertToThreeAddress will call getImm() + // which requires isImm() to be true + return 0; + } + } + return TII->convertToThreeAddress(MFI, MBBI, 0); +} + +FunctionPass *llvm::createX86FixupLEAs() { + return new FixupLEAPass(); +} + +bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + TII = Func.getTarget().getInstrInfo(); + TM = &MF->getTarget(); + + DEBUG(dbgs() << "Start X86FixupLEAs\n";); + // Process all basic blocks. 
+ for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) + processBasicBlock(Func, I); + DEBUG(dbgs() << "End X86FixupLEAs\n";); + + return true; +} + +FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I) { + RegUsageState RegUsage = RU_NotUsed; + MachineInstr* MI = I; + + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand& opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()){ + if (opnd.isDef()) + return RU_Write; + RegUsage = RU_Read; + } + } + return RegUsage; +} + +/// getPreviousInstr - Given a reference to an instruction in a basic +/// block, return a reference to the previous instruction in the block, +/// wrapping around to the last instruction of the block if the block +/// branches to itself. +static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + if (I == MFI->begin()) { + if (MFI->isPredecessor(MFI)) { + I = --MFI->end(); + return true; + } + else + return false; + } + --I; + return true; +} + +MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst; + static const int INSTR_DISTANCE_THRESHOLD = 5; + + CurInst = I; + bool Found; + Found = getPreviousInstr(CurInst, MFI); + while( Found && I != CurInst) { + if (CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > INSTR_DISTANCE_THRESHOLD) + break; // too far back to make a difference + if (usesRegister(p, CurInst) == RU_Write){ + return CurInst; + } + InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); + Found = getPreviousInstr(CurInst, MFI); + } + return 0; +} + +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + // Process a load, store, or LEA instruction. + MachineInstr *MI = I; + int opcode = MI->getOpcode(); + const MCInstrDesc& Desc = MI->getDesc(); + int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); + if (AddrOffset >= 0) { + AddrOffset += X86II::getOperandBias(Desc); + MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + if (p.isReg() && p.getReg() != X86::ESP) { + seekLEAFixup(p, I, MFI); + } + MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + if (q.isReg() && q.getReg() != X86::ESP) { + seekLEAFixup(q, I, MFI); + } + } +} + +void FixupLEAPass::seekLEAFixup(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + if (MBI) { + MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); + if (NewMI) { + ++NumLEAs; + DEBUG(dbgs() << "Candidate to replace:"; MBI->dump();); + // now to replace with an equivalent LEA... 
+ DEBUG(dbgs() << "Replaced by: "; NewMI->dump();); + MFI->erase(MBI); + MachineBasicBlock::iterator J = + static_cast<MachineBasicBlock::iterator> (NewMI); + processInstruction(J, MFI); + } + } +} + +bool FixupLEAPass::processBasicBlock(MachineFunction &MF, + MachineFunction::iterator MFI) { + + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) + processInstruction(I, MFI); + return false; +} diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp new file mode 100644 index 0000000..0585b43 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -0,0 +1,1768 @@ +//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which converts floating point instructions from +// pseudo registers into register stack instructions. This pass uses live +// variable information to indicate where the FPn registers are used and their +// lifetimes. +// +// The x87 hardware tracks liveness of the stack registers, so it is necessary +// to implement exact liveness tracking between basic blocks. The CFG edges are +// partitioned into bundles where the same FP registers must be live in +// identical stack positions. Instructions are inserted at the end of each basic +// block to rearrange the live registers to match the outgoing bundle. +// +// This approach avoids splitting critical edges at the potential cost of more +// live register shuffling instructions when critical edges are present. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-codegen" +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumFXCH, "Number of fxch instructions inserted"); +STATISTIC(NumFP , "Number of floating point instructions"); + +namespace { + struct FPS : public MachineFunctionPass { + static char ID; + FPS() : MachineFunctionPass(ID) { + initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); + // This is really only to keep valgrind quiet. + // The logic in isLive() is too much for it. 
+ memset(Stack, 0, sizeof(Stack)); + memset(RegMap, 0, sizeof(RegMap)); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundles>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { return "X86 FP Stackifier"; } + + private: + const TargetInstrInfo *TII; // Machine instruction info. + + // Two CFG edges are related if they leave the same block, or enter the same + // block. The transitive closure of an edge under this relation is a + // LiveBundle. It represents a set of CFG edges where the live FP stack + // registers must be allocated identically in the x87 stack. + // + // A LiveBundle is usually all the edges leaving a block, or all the edges + // entering a block, but it can contain more edges if critical edges are + // present. + // + // The set of live FP registers in a LiveBundle is calculated by bundleCFG, + // but the exact mapping of FP registers to stack slots is fixed later. + struct LiveBundle { + // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. + unsigned Mask; + + // Number of pre-assigned live registers in FixStack. This is 0 when the + // stack order has not yet been fixed. + unsigned FixCount; + + // Assigned stack order for live-in registers. + // FixStack[i] == getStackEntry(i) for all i < FixCount. + unsigned char FixStack[8]; + + LiveBundle() : Mask(0), FixCount(0) {} + + // Have the live registers been assigned a stack order yet? + bool isFixed() const { return !Mask || FixCount; } + }; + + // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges + // with no live FP registers. + SmallVector<LiveBundle, 8> LiveBundles; + + // The edge bundle analysis provides indices into the LiveBundles vector. + EdgeBundles *Bundles; + + // Return a bitmask of FP registers in block's live-in list. + static unsigned calcLiveInMask(MachineBasicBlock *MBB) { + unsigned Mask = 0; + for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), + E = MBB->livein_end(); I != E; ++I) { + unsigned Reg = *I - X86::FP0; + if (Reg < 8) + Mask |= 1 << Reg; + } + return Mask; + } + + // Partition all the CFG edges into LiveBundles. + void bundleCFG(MachineFunction &MF); + + MachineBasicBlock *MBB; // Current basic block + + // The hardware keeps track of how many FP registers are live, so we have + // to model that exactly. Usually, each live register corresponds to an + // FP<n> register, but when dealing with calls, returns, and inline + // assembly, it is sometimes necessary to have live scratch registers. + unsigned Stack[8]; // FP<n> Registers in each stack slot... + unsigned StackTop; // The current top of the FP stack. + + enum { + NumFPRegs = 16 // Including scratch pseudo-registers. + }; + + // For each live FP<n> register, point to its Stack[] entry. + // The first entries correspond to FP0-FP6, the rest are scratch registers + // used when we need slightly different live registers than what the + // register allocator thinks. + unsigned RegMap[NumFPRegs]; + + // Pending fixed registers - Inline assembly needs FP registers to appear + // in fixed stack slot positions. This is handled by copying FP registers + // to ST registers before the instruction, and copying back after the + // instruction. + // + // This is modeled with pending ST registers. 
NumPendingSTs is the number
+    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
+    // register that holds the ST value. The ST registers are not moved into
+    // place until immediately before the instruction that needs them.
+    //
+    // It can happen that we need an ST register to be live when no FP register
+    // holds the value:
+    //
+    //   %ST0 = COPY %FP4<kill>
+    //
+    // When that happens, we allocate a scratch FP register to hold the ST
+    // value. That means every register in PendingST must be live.
+
+    unsigned NumPendingSTs;
+    unsigned char PendingST[8];
+
+    // Set up our stack model to match the incoming registers to MBB.
+    void setupBlockStack();
+
+    // Shuffle live registers to match the expectations of successor blocks.
+    void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dumpStack() const {
+      dbgs() << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        dbgs() << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      for (unsigned i = 0; i != NumPendingSTs; ++i)
+        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
+      dbgs() << "\n";
+    }
+#endif
+
+    /// getSlot - Return the stack slot number a particular register number is
+    /// in.
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < NumFPRegs && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    /// isLive - Is RegNo currently live in the stack?
+    bool isLive(unsigned RegNo) const {
+      unsigned Slot = getSlot(RegNo);
+      return Slot < StackTop && Stack[Slot] == RegNo;
+    }
+
+    /// getScratchReg - Return an FP register that is not currently in use.
+    unsigned getScratchReg() const {
+      for (int i = NumFPRegs - 1; i >= 8; --i)
+        if (!isLive(i))
+          return i;
+      llvm_unreachable("Ran out of scratch FP registers");
+    }
+
+    /// isScratchReg - Returns true if RegNo is a scratch FP register.
+    static bool isScratchReg(unsigned RegNo) {
+      return RegNo > 8 && RegNo < NumFPRegs;
+    }
+
+    /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+    unsigned getStackEntry(unsigned STi) const {
+      if (STi >= StackTop)
+        report_fatal_error("Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    /// getSTReg - Return the X86::ST(i) register which contains the specified
+    /// FP<RegNo> register.
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack.
+    void pushReg(unsigned Reg) {
+      assert(Reg < NumFPRegs && "Register number out of range!");
+      if (StackTop >= 8)
+        report_fatal_error("Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      if (isAtTop(RegNo)) return;
+
+      unsigned STReg = getSTReg(RegNo);
+      unsigned RegOnTop = getStackEntry(0);
+
+      // Swap the slots the regs are in.
+      std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+      // Swap stack slot contents.
+      if (RegMap[RegOnTop] >= StackTop)
+        report_fatal_error("Access past stack top!");
+      std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processor's version of the state.
+      BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+      ++NumFXCH;
+    }
+
+    void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+      DebugLoc dl = I == MBB->end() ?
DebugLoc() : I->getDebugLoc(); + unsigned STReg = getSTReg(RegNo); + pushReg(AsReg); // New register on top of stack + + BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); + } + + /// duplicatePendingSTBeforeKill - The instruction at I is about to kill + /// RegNo. If any PendingST registers still need the RegNo value, duplicate + /// them to new scratch registers. + void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) { + for (unsigned i = 0; i != NumPendingSTs; ++i) { + if (PendingST[i] != RegNo) + continue; + unsigned SR = getScratchReg(); + DEBUG(dbgs() << "Duplicating pending ST" << i + << " in FP" << RegNo << " to FP" << SR << '\n'); + duplicateToTop(RegNo, SR, I); + PendingST[i] = SR; + } + } + + /// popStackAfter - Pop the current value off of the top of the FP stack + /// after the specified instruction. + void popStackAfter(MachineBasicBlock::iterator &I); + + /// freeStackSlotAfter - Free the specified register from the register + /// stack, so that it is no longer in a register. If the register is + /// currently at the top of the stack, we just pop the current instruction, + /// otherwise we store the current top-of-stack into the specified slot, + /// then pop the top of stack. + void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + + /// freeStackSlotBefore - Just the pop, no folding. Return the inserted + /// instruction. + MachineBasicBlock::iterator + freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo); + + /// Adjust the live registers to be the set in Mask. + void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); + + /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is + /// st(0), FP reg FixStack[1] is st(1) etc. + void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, + MachineBasicBlock::iterator I); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + void handleZeroArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFPRW(MachineBasicBlock::iterator &I); + void handleTwoArgFP(MachineBasicBlock::iterator &I); + void handleCompareFP(MachineBasicBlock::iterator &I); + void handleCondMovFP(MachineBasicBlock::iterator &I); + void handleSpecialFP(MachineBasicBlock::iterator &I); + + // Check if a COPY instruction is using FP registers. + static bool isFPCopy(MachineInstr *MI) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + return X86::RFP80RegClass.contains(DstReg) || + X86::RFP80RegClass.contains(SrcReg); + } + }; + char FPS::ID = 0; +} + +FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } + +/// getFPReg - Return the X86::FPx register number for the specified operand. +/// For example, this returns 3 for X86::FP3. +static unsigned getFPReg(const MachineOperand &MO) { + assert(MO.isReg() && "Expected an FP register!"); + unsigned Reg = MO.getReg(); + assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); + return Reg - X86::FP0; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP +/// register references into FP stack references. +/// +bool FPS::runOnMachineFunction(MachineFunction &MF) { + // We only need to run this pass if there are any FP registers used in this + // function. If it is all integer, there is nothing for us to do! 
+ bool FPIsUsed = false; + + assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!"); + for (unsigned i = 0; i <= 6; ++i) + if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + FPIsUsed = true; + break; + } + + // Early exit. + if (!FPIsUsed) return false; + + Bundles = &getAnalysis<EdgeBundles>(); + TII = MF.getTarget().getInstrInfo(); + + // Prepare cross-MBB liveness. + bundleCFG(MF); + + StackTop = 0; + + // Process the function in depth first order so that we process at least one + // of the predecessors for every reachable block in the function. + SmallPtrSet<MachineBasicBlock*, 8> Processed; + MachineBasicBlock *Entry = MF.begin(); + + bool Changed = false; + for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> > + I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed); + I != E; ++I) + Changed |= processBasicBlock(MF, **I); + + // Process any unreachable blocks in arbitrary order now. + if (MF.size() != Processed.size()) + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + if (Processed.insert(BB)) + Changed |= processBasicBlock(MF, *BB); + + LiveBundles.clear(); + + return Changed; +} + +/// bundleCFG - Scan all the basic blocks to determine consistent live-in and +/// live-out sets for the FP registers. Consistent means that the set of +/// registers live-out from a block is identical to the live-in set of all +/// successors. This is not enforced by the normal live-in lists since +/// registers may be implicitly defined, or not used by all successors. +void FPS::bundleCFG(MachineFunction &MF) { + assert(LiveBundles.empty() && "Stale data in LiveBundles"); + LiveBundles.resize(Bundles->getNumBundles()); + + // Gather the actual live-in masks for all MBBs. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock *MBB = I; + const unsigned Mask = calcLiveInMask(MBB); + if (!Mask) + continue; + // Update MBB ingoing bundle mask. + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// transforming FP instructions into their stack form. +/// +bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { + bool Changed = false; + MBB = &BB; + NumPendingSTs = 0; + + setupBlockStack(); + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { + MachineInstr *MI = I; + uint64_t Flags = MI->getDesc().TSFlags; + + unsigned FPInstClass = Flags & X86II::FPTypeMask; + if (MI->isInlineAsm()) + FPInstClass = X86II::SpecialFP; + + if (MI->isCopy() && isFPCopy(MI)) + FPInstClass = X86II::SpecialFP; + + if (MI->isImplicitDef() && + X86::RFP80RegClass.contains(MI->getOperand(0).getReg())) + FPInstClass = X86II::SpecialFP; + + if (FPInstClass == X86II::NotFP) + continue; // Efficiently ignore non-fp insts! + + MachineInstr *PrevMI = 0; + if (I != BB.begin()) + PrevMI = prior(I); + + ++NumFP; // Keep track of # of pseudo instrs + DEBUG(dbgs() << "\nFPInst:\t" << *MI); + + // Get dead variables list now because the MI pointer may be deleted as part + // of processing! 
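+    // (Editorial note: handleTwoArgFP, for example, builds a replacement
+    // instruction and deletes the original MI, so the dead-register list must
+    // be captured before dispatching below.)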
+ SmallVector<unsigned, 8> DeadRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDead()) + DeadRegs.push_back(MO.getReg()); + } + + switch (FPInstClass) { + case X86II::ZeroArgFP: handleZeroArgFP(I); break; + case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0) + case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0)) + case X86II::TwoArgFP: handleTwoArgFP(I); break; + case X86II::CompareFP: handleCompareFP(I); break; + case X86II::CondMovFP: handleCondMovFP(I); break; + case X86II::SpecialFP: handleSpecialFP(I); break; + default: llvm_unreachable("Unknown FP Type!"); + } + + // Check to see if any of the values defined by this instruction are dead + // after definition. If so, pop them. + for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { + unsigned Reg = DeadRegs[i]; + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); + freeStackSlotAfter(I, Reg-X86::FP0); + } + } + + // Print out all of the instructions expanded to if -debug + DEBUG( + MachineBasicBlock::iterator PrevI(PrevMI); + if (I == PrevI) { + dbgs() << "Just deleted pseudo instruction\n"; + } else { + MachineBasicBlock::iterator Start = I; + // Rewind to first instruction newly inserted. + while (Start != BB.begin() && prior(Start) != PrevI) --Start; + dbgs() << "Inserted instructions:\n\t"; + Start->print(dbgs(), &MF.getTarget()); + while (++Start != llvm::next(I)) {} + } + dumpStack(); + ); + (void)PrevMI; + + Changed = true; + } + + finishBlockStack(); + + return Changed; +} + +/// setupBlockStack - Use the live bundles to set up our model of the stack +/// to match predecessors' live out stack. +void FPS::setupBlockStack() { + DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + StackTop = 0; + // Get the live-in bundle for MBB. + const LiveBundle &Bundle = + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; + + if (!Bundle.Mask) { + DEBUG(dbgs() << "Block has no FP live-ins.\n"); + return; + } + + // Depth-first iteration should ensure that we always have an assigned stack. + assert(Bundle.isFixed() && "Reached block before any predecessors"); + + // Push the fixed live-in registers. + for (unsigned i = Bundle.FixCount; i > 0; --i) { + MBB->addLiveIn(X86::ST0+i-1); + DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" + << unsigned(Bundle.FixStack[i-1]) << '\n'); + pushReg(Bundle.FixStack[i-1]); + } + + // Kill off unwanted live-ins. This can happen with a critical edge. + // FIXME: We could keep these live registers around as zombies. They may need + // to be revived at the end of a short block. It might save a few instrs. + adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + DEBUG(MBB->dump()); +} + +/// finishBlockStack - Revive live-outs that are implicitly defined out of +/// MBB. Shuffle live registers to match the expected fixed stack of any +/// predecessors, and ensure that all predecessors are expecting the same +/// stack. +void FPS::finishBlockStack() { + // The RET handling below takes care of return blocks for us. + if (MBB->succ_empty()) + return; + + DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + + // Get MBB's live-out bundle. 
+  unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+  LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+  // We may need to kill and define some registers to match successors.
+  // FIXME: This can probably be combined with the shuffle below.
+  MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+  adjustLiveRegs(Bundle.Mask, Term);
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "No live-outs.\n");
+    return;
+  }
+
+  // Has the stack order been fixed yet?
+  DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+  if (Bundle.isFixed()) {
+    DEBUG(dbgs() << "Shuffling stack to match.\n");
+    shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+  } else {
+    // Not fixed yet, we get to choose.
+    DEBUG(dbgs() << "Fixing stack order now.\n");
+    Bundle.FixCount = StackTop;
+    for (unsigned i = 0; i < StackTop; ++i)
+      Bundle.FixStack[i] = getStackEntry(i);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct TableEntry {
+    uint16_t from;
+    uint16_t to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+                                                const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 0; i != NumEntries-1; ++i)
+    if (!(Table[i] < Table[i+1])) return false;
+  return true;
+}
+#endif
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I->to;
+  return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                             \
+  { static bool TABLE##Checked = false;                                  \
+    if (!TABLE##Checked) {                                               \
+      assert(TableIsSorted(TABLE, array_lengthof(TABLE)) &&              \
+             "All lookup tables must be sorted for efficient access!");  \
+      TABLE##Checked = true;                                             \
+    }                                                                    \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
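+// For example (an illustrative lookup): getConcreteOpcode(X86::ADD_Fp32m)
+// binary-searches this table with std::lower_bound and returns
+// X86::ADD_F32m, the memory-form fadd that operates on st(0).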
+// +static const TableEntry OpcodeTable[] = { + { X86::ABS_Fp32 , X86::ABS_F }, + { X86::ABS_Fp64 , X86::ABS_F }, + { X86::ABS_Fp80 , X86::ABS_F }, + { X86::ADD_Fp32m , X86::ADD_F32m }, + { X86::ADD_Fp64m , X86::ADD_F64m }, + { X86::ADD_Fp64m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m64 , X86::ADD_F64m }, + { X86::ADD_FpI16m32 , X86::ADD_FI16m }, + { X86::ADD_FpI16m64 , X86::ADD_FI16m }, + { X86::ADD_FpI16m80 , X86::ADD_FI16m }, + { X86::ADD_FpI32m32 , X86::ADD_FI32m }, + { X86::ADD_FpI32m64 , X86::ADD_FI32m }, + { X86::ADD_FpI32m80 , X86::ADD_FI32m }, + { X86::CHS_Fp32 , X86::CHS_F }, + { X86::CHS_Fp64 , X86::CHS_F }, + { X86::CHS_Fp80 , X86::CHS_F }, + { X86::CMOVBE_Fp32 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp64 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp80 , X86::CMOVBE_F }, + { X86::CMOVB_Fp32 , X86::CMOVB_F }, + { X86::CMOVB_Fp64 , X86::CMOVB_F }, + { X86::CMOVB_Fp80 , X86::CMOVB_F }, + { X86::CMOVE_Fp32 , X86::CMOVE_F }, + { X86::CMOVE_Fp64 , X86::CMOVE_F }, + { X86::CMOVE_Fp80 , X86::CMOVE_F }, + { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F }, + { X86::CMOVNB_Fp32 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp64 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp80 , X86::CMOVNB_F }, + { X86::CMOVNE_Fp32 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp64 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp80 , X86::CMOVNE_F }, + { X86::CMOVNP_Fp32 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp64 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp80 , X86::CMOVNP_F }, + { X86::CMOVP_Fp32 , X86::CMOVP_F }, + { X86::CMOVP_Fp64 , X86::CMOVP_F }, + { X86::CMOVP_Fp80 , X86::CMOVP_F }, + { X86::COS_Fp32 , X86::COS_F }, + { X86::COS_Fp64 , X86::COS_F }, + { X86::COS_Fp80 , X86::COS_F }, + { X86::DIVR_Fp32m , X86::DIVR_F32m }, + { X86::DIVR_Fp64m , X86::DIVR_F64m }, + { X86::DIVR_Fp64m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m64 , X86::DIVR_F64m }, + { X86::DIVR_FpI16m32, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m64, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m80, X86::DIVR_FI16m}, + { X86::DIVR_FpI32m32, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m64, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m80, X86::DIVR_FI32m}, + { X86::DIV_Fp32m , X86::DIV_F32m }, + { X86::DIV_Fp64m , X86::DIV_F64m }, + { X86::DIV_Fp64m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m64 , X86::DIV_F64m }, + { X86::DIV_FpI16m32 , X86::DIV_FI16m }, + { X86::DIV_FpI16m64 , X86::DIV_FI16m }, + { X86::DIV_FpI16m80 , X86::DIV_FI16m }, + { X86::DIV_FpI32m32 , X86::DIV_FI32m }, + { X86::DIV_FpI32m64 , X86::DIV_FI32m }, + { X86::DIV_FpI32m80 , X86::DIV_FI32m }, + { X86::ILD_Fp16m32 , X86::ILD_F16m }, + { X86::ILD_Fp16m64 , X86::ILD_F16m }, + { X86::ILD_Fp16m80 , X86::ILD_F16m }, + { X86::ILD_Fp32m32 , X86::ILD_F32m }, + { X86::ILD_Fp32m64 , X86::ILD_F32m }, + { X86::ILD_Fp32m80 , X86::ILD_F32m }, + { X86::ILD_Fp64m32 , X86::ILD_F64m }, + { X86::ILD_Fp64m64 , X86::ILD_F64m }, + { X86::ILD_Fp64m80 , X86::ILD_F64m }, + { X86::ISTT_Fp16m32 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m64 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m80 , X86::ISTT_FP16m}, + { X86::ISTT_Fp32m32 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m64 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m80 , X86::ISTT_FP32m}, + { X86::ISTT_Fp64m32 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m64 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m80 , X86::ISTT_FP64m}, + { X86::IST_Fp16m32 , X86::IST_F16m }, + { X86::IST_Fp16m64 , X86::IST_F16m }, + { X86::IST_Fp16m80 , X86::IST_F16m }, + { X86::IST_Fp32m32 , X86::IST_F32m }, + 
{ X86::IST_Fp32m64 , X86::IST_F32m }, + { X86::IST_Fp32m80 , X86::IST_F32m }, + { X86::IST_Fp64m32 , X86::IST_FP64m }, + { X86::IST_Fp64m64 , X86::IST_FP64m }, + { X86::IST_Fp64m80 , X86::IST_FP64m }, + { X86::LD_Fp032 , X86::LD_F0 }, + { X86::LD_Fp064 , X86::LD_F0 }, + { X86::LD_Fp080 , X86::LD_F0 }, + { X86::LD_Fp132 , X86::LD_F1 }, + { X86::LD_Fp164 , X86::LD_F1 }, + { X86::LD_Fp180 , X86::LD_F1 }, + { X86::LD_Fp32m , X86::LD_F32m }, + { X86::LD_Fp32m64 , X86::LD_F32m }, + { X86::LD_Fp32m80 , X86::LD_F32m }, + { X86::LD_Fp64m , X86::LD_F64m }, + { X86::LD_Fp64m80 , X86::LD_F64m }, + { X86::LD_Fp80m , X86::LD_F80m }, + { X86::MUL_Fp32m , X86::MUL_F32m }, + { X86::MUL_Fp64m , X86::MUL_F64m }, + { X86::MUL_Fp64m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m64 , X86::MUL_F64m }, + { X86::MUL_FpI16m32 , X86::MUL_FI16m }, + { X86::MUL_FpI16m64 , X86::MUL_FI16m }, + { X86::MUL_FpI16m80 , X86::MUL_FI16m }, + { X86::MUL_FpI32m32 , X86::MUL_FI32m }, + { X86::MUL_FpI32m64 , X86::MUL_FI32m }, + { X86::MUL_FpI32m80 , X86::MUL_FI32m }, + { X86::SIN_Fp32 , X86::SIN_F }, + { X86::SIN_Fp64 , X86::SIN_F }, + { X86::SIN_Fp80 , X86::SIN_F }, + { X86::SQRT_Fp32 , X86::SQRT_F }, + { X86::SQRT_Fp64 , X86::SQRT_F }, + { X86::SQRT_Fp80 , X86::SQRT_F }, + { X86::ST_Fp32m , X86::ST_F32m }, + { X86::ST_Fp64m , X86::ST_F64m }, + { X86::ST_Fp64m32 , X86::ST_F32m }, + { X86::ST_Fp80m32 , X86::ST_F32m }, + { X86::ST_Fp80m64 , X86::ST_F64m }, + { X86::ST_FpP80m , X86::ST_FP80m }, + { X86::SUBR_Fp32m , X86::SUBR_F32m }, + { X86::SUBR_Fp64m , X86::SUBR_F64m }, + { X86::SUBR_Fp64m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m64 , X86::SUBR_F64m }, + { X86::SUBR_FpI16m32, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m64, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m80, X86::SUBR_FI16m}, + { X86::SUBR_FpI32m32, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m64, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m80, X86::SUBR_FI32m}, + { X86::SUB_Fp32m , X86::SUB_F32m }, + { X86::SUB_Fp64m , X86::SUB_F64m }, + { X86::SUB_Fp64m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m64 , X86::SUB_F64m }, + { X86::SUB_FpI16m32 , X86::SUB_FI16m }, + { X86::SUB_FpI16m64 , X86::SUB_FI16m }, + { X86::SUB_FpI16m80 , X86::SUB_FI16m }, + { X86::SUB_FpI32m32 , X86::SUB_FI32m }, + { X86::SUB_FpI32m64 , X86::SUB_FI32m }, + { X86::SUB_FpI32m80 , X86::SUB_FI32m }, + { X86::TST_Fp32 , X86::TST_F }, + { X86::TST_Fp64 , X86::TST_F }, + { X86::TST_Fp80 , X86::TST_F }, + { X86::UCOM_FpIr32 , X86::UCOM_FIr }, + { X86::UCOM_FpIr64 , X86::UCOM_FIr }, + { X86::UCOM_FpIr80 , X86::UCOM_FIr }, + { X86::UCOM_Fpr32 , X86::UCOM_Fr }, + { X86::UCOM_Fpr64 , X86::UCOM_Fr }, + { X86::UCOM_Fpr80 , X86::UCOM_Fr }, +}; + +static unsigned getConcreteOpcode(unsigned Opcode) { + ASSERT_SORTED(OpcodeTable); + int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode); + assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); + return Opc; +} + +//===----------------------------------------------------------------------===// +// Helper Methods +//===----------------------------------------------------------------------===// + +// PopTable - Sorted map of instructions to their popping version. The first +// element is an instruction, the second is the version which pops. 
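+// For example (an illustrative entry): when the value in st(0) dies at an
+// ST_F32m (fst dword), popStackAfter() looks the opcode up here and rewrites
+// the instruction to ST_FP32m (fstp dword), folding the pop into the store.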
+// +static const TableEntry PopTable[] = { + { X86::ADD_FrST0 , X86::ADD_FPrST0 }, + + { X86::DIVR_FrST0, X86::DIVR_FPrST0 }, + { X86::DIV_FrST0 , X86::DIV_FPrST0 }, + + { X86::IST_F16m , X86::IST_FP16m }, + { X86::IST_F32m , X86::IST_FP32m }, + + { X86::MUL_FrST0 , X86::MUL_FPrST0 }, + + { X86::ST_F32m , X86::ST_FP32m }, + { X86::ST_F64m , X86::ST_FP64m }, + { X86::ST_Frr , X86::ST_FPrr }, + + { X86::SUBR_FrST0, X86::SUBR_FPrST0 }, + { X86::SUB_FrST0 , X86::SUB_FPrST0 }, + + { X86::UCOM_FIr , X86::UCOM_FIPr }, + + { X86::UCOM_FPr , X86::UCOM_FPPr }, + { X86::UCOM_Fr , X86::UCOM_FPr }, +}; + +/// popStackAfter - Pop the current value off of the top of the FP stack after +/// the specified instruction. This attempts to be sneaky and combine the pop +/// into the instruction itself if possible. The iterator is left pointing to +/// the last instruction, be it a new pop instruction inserted, or the old +/// instruction if it was modified in place. +/// +void FPS::popStackAfter(MachineBasicBlock::iterator &I) { + MachineInstr* MI = I; + DebugLoc dl = MI->getDebugLoc(); + ASSERT_SORTED(PopTable); + if (StackTop == 0) + report_fatal_error("Cannot pop empty stack!"); + RegMap[Stack[--StackTop]] = ~0; // Update state + + // Check to see if there is a popping version of this instruction... + int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode()); + if (Opcode != -1) { + I->setDesc(TII->get(Opcode)); + if (Opcode == X86::UCOM_FPPr) + I->RemoveOperand(0); + } else { // Insert an explicit pop + I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0); + } +} + +/// freeStackSlotAfter - Free the specified register from the register stack, so +/// that it is no longer in a register. If the register is currently at the top +/// of the stack, we just pop the current instruction, otherwise we store the +/// current top-of-stack into the specified slot, then pop the top of stack. +void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { + if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy. + popStackAfter(I); + return; + } + + // Otherwise, store the top of stack into the dead slot, killing the operand + // without having to add in an explicit xchg then pop. + // + I = freeStackSlotBefore(++I, FPRegNo); +} + +/// freeStackSlotBefore - Free the specified register without trying any +/// folding. +MachineBasicBlock::iterator +FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { + unsigned STReg = getSTReg(FPRegNo); + unsigned OldSlot = getSlot(FPRegNo); + unsigned TopReg = Stack[StackTop-1]; + Stack[OldSlot] = TopReg; + RegMap[TopReg] = OldSlot; + RegMap[FPRegNo] = ~0; + Stack[--StackTop] = ~0; + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg); +} + +/// adjustLiveRegs - Kill and revive registers such that exactly the FP +/// registers with a bit in Mask are live. +void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { + unsigned Defs = Mask; + unsigned Kills = 0; + for (unsigned i = 0; i < StackTop; ++i) { + unsigned RegNo = Stack[i]; + if (!(Defs & (1 << RegNo))) + // This register is live, but we don't want it. + Kills |= (1 << RegNo); + else + // We don't need to imp-def this live register. + Defs &= ~(1 << RegNo); + } + assert((Kills & Defs) == 0 && "Register needs killing and def'ing?"); + + // Produce implicit-defs for free by using killed registers. 
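+  // For example (an illustrative case): if FP2 must be killed and FP5 must be
+  // defined, the slot currently holding FP2 is simply relabeled as FP5 by
+  // swapping the Stack[] and RegMap[] entries; no fxch or pop is emitted.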
+  while (Kills && Defs) {
+    unsigned KReg = CountTrailingZeros_32(Kills);
+    unsigned DReg = CountTrailingZeros_32(Defs);
+    DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+    std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+    std::swap(RegMap[KReg], RegMap[DReg]);
+    Kills &= ~(1 << KReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Kill registers by popping.
+  if (Kills && I != MBB->begin()) {
+    MachineBasicBlock::iterator I2 = llvm::prior(I);
+    while (StackTop) {
+      unsigned KReg = getStackEntry(0);
+      if (!(Kills & (1 << KReg)))
+        break;
+      DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+      popStackAfter(I2);
+      Kills &= ~(1 << KReg);
+    }
+  }
+
+  // Manually kill the rest.
+  while (Kills) {
+    unsigned KReg = CountTrailingZeros_32(Kills);
+    DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+    freeStackSlotBefore(I, KReg);
+    Kills &= ~(1 << KReg);
+  }
+
+  // Load zeros for all the imp-defs.
+  while (Defs) {
+    unsigned DReg = CountTrailingZeros_32(Defs);
+    DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+    BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+    pushReg(DReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Now we should have the correct registers live.
+  DEBUG(dumpStack());
+  assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+                          unsigned FixCount,
+                          MachineBasicBlock::iterator I) {
+  // Move items into place, starting from the desired stack bottom.
+  while (FixCount--) {
+    // Old register at position FixCount.
+    unsigned OldReg = getStackEntry(FixCount);
+    // Desired register at position FixCount.
+    unsigned Reg = FixStack[FixCount];
+    if (Reg == OldReg)
+      continue;
+    // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+    moveToTop(Reg, I);
+    if (FixCount > 0)
+      moveToTop(OldReg, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  if (KillsSrc)
+    duplicatePendingSTBeforeKill(Reg, I);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
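+  // For example (an illustrative sketch in AT&T syntax), storing a still-live
+  // FP3 through a 64-bit fisttp becomes roughly:
+  //
+  //   fld %st(i)       ; duplicateToTop: push a scratch copy of FP3
+  //   fisttpll (mem)   ; pops only the scratch copy, FP3 stays live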
+ // + if (!KillsSrc && + (MI->getOpcode() == X86::IST_Fp64m32 || + MI->getOpcode() == X86::ISTT_Fp16m32 || + MI->getOpcode() == X86::ISTT_Fp32m32 || + MI->getOpcode() == X86::ISTT_Fp64m32 || + MI->getOpcode() == X86::IST_Fp64m64 || + MI->getOpcode() == X86::ISTT_Fp16m64 || + MI->getOpcode() == X86::ISTT_Fp32m64 || + MI->getOpcode() == X86::ISTT_Fp64m64 || + MI->getOpcode() == X86::IST_Fp64m80 || + MI->getOpcode() == X86::ISTT_Fp16m80 || + MI->getOpcode() == X86::ISTT_Fp32m80 || + MI->getOpcode() == X86::ISTT_Fp64m80 || + MI->getOpcode() == X86::ST_FpP80m)) { + duplicateToTop(Reg, getScratchReg(), I); + } else { + moveToTop(Reg, I); // Move to the top of the stack... + } + + // Convert from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + if (MI->getOpcode() == X86::IST_FP64m || + MI->getOpcode() == X86::ISTT_FP16m || + MI->getOpcode() == X86::ISTT_FP32m || + MI->getOpcode() == X86::ISTT_FP64m || + MI->getOpcode() == X86::ST_FP80m) { + if (StackTop == 0) + report_fatal_error("Stack empty??"); + --StackTop; + } else if (KillsSrc) { // Last use of operand? + popStackAfter(I); + } +} + + +/// handleOneArgFPRW: Handle instructions that read from the top of stack and +/// replace the value with a newly computed value. These instructions may have +/// non-fp operands after their FP operands. +/// +/// Examples: +/// R1 = fchs R2 +/// R1 = fadd R2, [mem] +/// +void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; +#ifndef NDEBUG + unsigned NumOps = MI->getDesc().getNumOperands(); + assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!"); +#endif + + // Is this the last use of the source register? + unsigned Reg = getFPReg(MI->getOperand(1)); + bool KillsSrc = MI->killsRegister(X86::FP0+Reg); + + if (KillsSrc) { + duplicatePendingSTBeforeKill(Reg, I); + // If this is the last use of the source register, just make sure it's on + // the top of the stack. + moveToTop(Reg, I); + if (StackTop == 0) + report_fatal_error("Stack cannot be empty!"); + --StackTop; + pushReg(getFPReg(MI->getOperand(0))); + } else { + // If this is not the last use of the source register, _copy_ it to the top + // of the stack. + duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I); + } + + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(1); // Drop the source operand. + MI->RemoveOperand(0); // Drop the destination operand. 
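+  // (Editorial note: the source operand is removed before the destination so
+  // that index 0 stays valid; removing operand 0 first would shift the source
+  // down and make the second RemoveOperand hit the wrong operand.)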
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); +} + + +//===----------------------------------------------------------------------===// +// Define tables of various ways to map pseudo instructions +// + +// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i) +static const TableEntry ForwardST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, + { X86::ADD_Fp64 , X86::ADD_FST0r }, + { X86::ADD_Fp80 , X86::ADD_FST0r }, + { X86::DIV_Fp32 , X86::DIV_FST0r }, + { X86::DIV_Fp64 , X86::DIV_FST0r }, + { X86::DIV_Fp80 , X86::DIV_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, + { X86::MUL_Fp64 , X86::MUL_FST0r }, + { X86::MUL_Fp80 , X86::MUL_FST0r }, + { X86::SUB_Fp32 , X86::SUB_FST0r }, + { X86::SUB_Fp64 , X86::SUB_FST0r }, + { X86::SUB_Fp80 , X86::SUB_FST0r }, +}; + +// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0) +static const TableEntry ReverseST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FST0r }, + { X86::DIV_Fp64 , X86::DIVR_FST0r }, + { X86::DIV_Fp80 , X86::DIVR_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FST0r }, + { X86::SUB_Fp64 , X86::SUBR_FST0r }, + { X86::SUB_Fp80 , X86::SUBR_FST0r }, +}; + +// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i) +static const TableEntry ForwardSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FrST0 }, + { X86::DIV_Fp64 , X86::DIVR_FrST0 }, + { X86::DIV_Fp80 , X86::DIVR_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FrST0 }, + { X86::SUB_Fp64 , X86::SUBR_FrST0 }, + { X86::SUB_Fp80 , X86::SUBR_FrST0 }, +}; + +// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0) +static const TableEntry ReverseSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, + { X86::ADD_Fp64 , X86::ADD_FrST0 }, + { X86::ADD_Fp80 , X86::ADD_FrST0 }, + { X86::DIV_Fp32 , X86::DIV_FrST0 }, + { X86::DIV_Fp64 , X86::DIV_FrST0 }, + { X86::DIV_Fp80 , X86::DIV_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, + { X86::MUL_Fp64 , X86::MUL_FrST0 }, + { X86::MUL_Fp80 , X86::MUL_FrST0 }, + { X86::SUB_Fp32 , X86::SUB_FrST0 }, + { X86::SUB_Fp64 , X86::SUB_FrST0 }, + { X86::SUB_Fp80 , X86::SUB_FrST0 }, +}; + + +/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual +/// instructions which need to be simplified and possibly transformed. 
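+///
+/// Which of the four result forms below is chosen depends on which operand
+/// already sits in ST(0) and which operand dies here, so a dying operand can
+/// be overwritten in place (an editorial summary of the selection logic).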
+/// +/// Result: ST(0) = fsub ST(0), ST(i) +/// ST(i) = fsub ST(0), ST(i) +/// ST(0) = fsubr ST(0), ST(i) +/// ST(i) = fsubr ST(0), ST(i) +/// +void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 3 && "Illegal TwoArgFP instruction!"); + unsigned Dest = getFPReg(MI->getOperand(0)); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + DebugLoc dl = MI->getDebugLoc(); + + unsigned TOS = getStackEntry(0); + + // One of our operands must be on the top of the stack. If neither is yet, we + // need to move one. + if (Op0 != TOS && Op1 != TOS) { // No operand at TOS? + // We can choose to move either operand to the top of the stack. If one of + // the operands is killed by this instruction, we want that one so that we + // can update right on top of the old version. + if (KillsOp0) { + moveToTop(Op0, I); // Move dead operand to TOS. + TOS = Op0; + } else if (KillsOp1) { + moveToTop(Op1, I); + TOS = Op1; + } else { + // All of the operands are live after this instruction executes, so we + // cannot update on top of any operand. Because of this, we must + // duplicate one of the stack elements to the top. It doesn't matter + // which one we pick. + // + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + } else if (!KillsOp0 && !KillsOp1) { + // If we DO have one of our operands at the top of the stack, but we don't + // have a dead operand, we must duplicate one of the operands to a new slot + // on the stack. + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + + // Now we know that one of our operands is on the top of the stack, and at + // least one of our operands is killed by this instruction. + assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) && + "Stack conditions not set up right!"); + + // We decide which form to use based on what is on the top of the stack, and + // which operand is killed by this instruction. + const TableEntry *InstTable; + bool isForward = TOS == Op0; + bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); + if (updateST0) { + if (isForward) + InstTable = ForwardST0Table; + else + InstTable = ReverseST0Table; + } else { + if (isForward) + InstTable = ForwardSTiTable; + else + InstTable = ReverseSTiTable; + } + + int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table), + MI->getOpcode()); + assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); + + // NotTOS - The register which is not on the top of stack... + unsigned NotTOS = (TOS == Op0) ? Op1 : Op0; + + // Replace the old instruction with a new instruction + MBB->remove(I++); + I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS)); + + // If both operands are killed, pop one off of the stack in addition to + // overwriting the other one. + if (KillsOp0 && KillsOp1 && Op0 != Op1) { + assert(!updateST0 && "Should have updated other operand!"); + popStackAfter(I); // Pop the top of stack + } + + // Update stack information so that we know the destination register is now on + // the stack. + unsigned UpdatedSlot = getSlot(updateST0 ? 
TOS : NotTOS); + assert(UpdatedSlot < StackTop && Dest < 7); + Stack[UpdatedSlot] = Dest; + RegMap[Dest] = UpdatedSlot; + MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction +} + +/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP +/// register arguments and no explicit destinations. +/// +void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 2 && "Illegal FUCOM* instruction!"); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // Make sure the first operand is on the top of stack, the other one can be + // anywhere. + moveToTop(Op0, I); + + // Change from the pseudo instruction to the concrete instruction. + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->RemoveOperand(1); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If any of the operands are killed by this instruction, free them. + if (KillsOp0) freeStackSlotAfter(I, Op0); + if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1); +} + +/// handleCondMovFP - Handle two address conditional move instructions. These +/// instructions move a st(i) register to st(0) iff a condition is true. These +/// instructions require that the first operand is at the top of the stack, but +/// otherwise don't modify the stack at all. +void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + + unsigned Op0 = getFPReg(MI->getOperand(0)); + unsigned Op1 = getFPReg(MI->getOperand(2)); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // The first operand *must* be on the top of the stack. + moveToTop(Op0, I); + + // Change the second operand to the stack register that the operand is in. + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(0); + MI->RemoveOperand(1); + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If we kill the second operand, make sure to pop it from the stack. + if (Op0 != Op1 && KillsOp1) { + // Get this value off of the register stack. + freeStackSlotAfter(I, Op1); + } +} + + +/// handleSpecialFP - Handle special instructions which behave unlike other +/// floating point instructions. This is primarily intended for use by pseudo +/// instructions. +/// +void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown SpecialFP instruction!"); + case TargetOpcode::COPY: { + // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. + const MachineOperand &MO1 = MI->getOperand(1); + const MachineOperand &MO0 = MI->getOperand(0); + unsigned DstST = MO0.getReg() - X86::ST0; + unsigned SrcST = MO1.getReg() - X86::ST0; + bool KillsSrc = MI->killsRegister(MO1.getReg()); + + // ST = COPY FP. Set up a pending ST register. + if (DstST < 8) { + unsigned SrcFP = getFPReg(MO1); + assert(isLive(SrcFP) && "Cannot copy dead register"); + assert(!MO0.isDead() && "Cannot copy to dead ST register"); + + // Unallocated STs are marked as the nonexistent FP255. 
+ while (NumPendingSTs <= DstST) + PendingST[NumPendingSTs++] = NumFPRegs; + + // STi could still be live from a previous inline asm. + if (isScratchReg(PendingST[DstST])) { + DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST]) + << '\n'); + freeStackSlotBefore(MI, PendingST[DstST]); + } + + // When the source is killed, allocate a scratch FP register. + if (KillsSrc) { + duplicatePendingSTBeforeKill(SrcFP, I); + unsigned Slot = getSlot(SrcFP); + unsigned SR = getScratchReg(); + PendingST[DstST] = SR; + Stack[Slot] = SR; + RegMap[SR] = Slot; + } else + PendingST[DstST] = SrcFP; + break; + } + + // FP = COPY ST. Extract fixed stack value. + // Any instruction defining ST registers must have assigned them to a + // scratch register. + if (SrcST < 8) { + unsigned DstFP = getFPReg(MO0); + assert(!isLive(DstFP) && "Cannot copy ST to live FP register"); + assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register"); + unsigned SrcFP = PendingST[SrcST]; + assert(isScratchReg(SrcFP) && "Expected ST in a scratch register"); + assert(isLive(SrcFP) && "Scratch holding ST is dead"); + + // DstFP steals the stack slot from SrcFP. + unsigned Slot = getSlot(SrcFP); + Stack[Slot] = DstFP; + RegMap[DstFP] = Slot; + + // Always treat the ST as killed. + PendingST[SrcST] = NumFPRegs; + while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) + --NumPendingSTs; + break; + } + + // FP <- FP copy. + unsigned DstFP = getFPReg(MO0); + unsigned SrcFP = getFPReg(MO1); + assert(isLive(SrcFP) && "Cannot copy dead register"); + if (KillsSrc) { + // If the input operand is killed, we can just change the owner of the + // incoming stack slot into the result. + unsigned Slot = getSlot(SrcFP); + Stack[Slot] = DstFP; + RegMap[DstFP] = Slot; + } else { + // For COPY we just duplicate the specified value to a new stack slot. + // This could be made better, but would require substantial changes. + duplicateToTop(SrcFP, DstFP, I); + } + break; + } + + case TargetOpcode::IMPLICIT_DEF: { + // All FP registers must be explicitly defined, so load a 0 instead. + unsigned Reg = MI->getOperand(0).getReg() - X86::FP0; + DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n'); + BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); + pushReg(Reg); + break; + } + + case X86::FpPOP_RETVAL: { + // The FpPOP_RETVAL instruction is used after calls that return a value on + // the floating point stack. We cannot model this with ST defs since CALL + // instructions have fixed clobber lists. This instruction is interpreted + // to mean that there is one more live register on the stack than we + // thought. + // + // This means that StackTop does not match the hardware stack between a + // call and the FpPOP_RETVAL instructions. We do tolerate FP instructions + // between CALL and FpPOP_RETVAL as long as they don't overflow the + // hardware stack. + unsigned DstFP = getFPReg(MI->getOperand(0)); + + // Move existing stack elements up to reflect reality. + assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL"); + if (StackTop) { + std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1); + for (unsigned i = 0; i != NumFPRegs; ++i) + ++RegMap[i]; + } + ++StackTop; + + // DstFP is the new bottom of the stack. + Stack[0] = DstFP; + RegMap[DstFP] = 0; + + // DstFP will be killed by processBasicBlock if this was a dead def. + break; + } + + case TargetOpcode::INLINEASM: { + // The inline asm MachineInstr currently only *uses* FP registers for the + // 'f' constraint. 
These should be turned into the current ST(x) register
+    // in the machine instr.
+    //
+    // There are special rules for x87 inline assembly. The compiler must know
+    // exactly how many registers are popped and pushed implicitly by the asm.
+    // Otherwise it is not possible to restore the stack state after the inline
+    // asm.
+    //
+    // There are 3 kinds of input operands:
+    //
+    // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+    //    popped input operand must be in a fixed stack slot, and it is either
+    //    tied to an output operand, or in the clobber list. The MI has ST use
+    //    and def operands for these inputs.
+    //
+    // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+    //    preserved by the inline asm. The fixed stack slots must be STn-STm
+    //    following the popped inputs. A fixed input operand cannot be tied to
+    //    an output or appear in the clobber list. The MI has ST use operands
+    //    and no defs for these inputs.
+    //
+    // 3. Preserved inputs. These inputs use the "f" constraint which is
+    //    represented as an FP register. The inline asm won't change these
+    //    stack slots.
+    //
+    // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+    // registers do not count as output operands. The inline asm changes the
+    // stack as if it popped all the popped inputs and then pushed all the
+    // output operands.
+
+    // Scan the assembly for ST registers used, defined and clobbered. We can
+    // only tell clobbers from defs by looking at the asm descriptor.
+    unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+    unsigned NumOps = 0;
+    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+         i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+      unsigned Flags = MI->getOperand(i).getImm();
+      NumOps = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumOps != 1)
+        continue;
+      const MachineOperand &MO = MI->getOperand(i + 1);
+      if (!MO.isReg())
+        continue;
+      unsigned STReg = MO.getReg() - X86::ST0;
+      if (STReg >= 8)
+        continue;
+
+      switch (InlineAsm::getKind(Flags)) {
+      case InlineAsm::Kind_RegUse:
+        STUses |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_RegDef:
+      case InlineAsm::Kind_RegDefEarlyClobber:
+        STDefs |= (1u << STReg);
+        if (MO.isDead())
+          STDeadDefs |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_Clobber:
+        STClobbers |= (1u << STReg);
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (STUses && !isMask_32(STUses))
+      MI->emitError("fixed input regs must be last on the x87 stack");
+    unsigned NumSTUses = CountTrailingOnes_32(STUses);
+
+    // Defs must be contiguous from the stack top. ST0-STn.
+    if (STDefs && !isMask_32(STDefs)) {
+      MI->emitError("output regs must be last on the x87 stack");
+      STDefs = NextPowerOf2(STDefs) - 1;
+    }
+    unsigned NumSTDefs = CountTrailingOnes_32(STDefs);
+
+    // So must the clobbered stack slots. ST0-STm, m >= n.
+    if (STClobbers && !isMask_32(STDefs | STClobbers))
+      MI->emitError("clobbers must be last on the x87 stack");
+
+    // Popped inputs are the ones that are also clobbered or defined.
+    unsigned STPopped = STUses & (STDefs | STClobbers);
+    if (STPopped && !isMask_32(STPopped))
+      MI->emitError("implicitly popped regs must be last on the x87 stack");
+    unsigned NumSTPopped = CountTrailingOnes_32(STPopped);
+
+    DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+                 << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+    // Scan the instruction for FP uses corresponding to "f" constraints.
+    // Collect FP registers to kill after the instruction.
+    // Always kill all the scratch regs.
+    unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+    unsigned FPUsed = 0;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      if (!Op.isUse())
+        MI->emitError("illegal \"f\" output constraint");
+      unsigned FPReg = getFPReg(Op);
+      FPUsed |= 1U << FPReg;
+
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm. We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isKill())
+        FPKills |= 1U << FPReg;
+    }
+
+    // The popped inputs will be killed by the instruction, so duplicate them
+    // if the FP register needs to be live after the instruction, or if it is
+    // used in the instruction itself. We effectively treat the popped inputs
+    // as early clobbers.
+    for (unsigned i = 0; i < NumSTPopped; ++i) {
+      if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
+        continue;
+      unsigned SR = getScratchReg();
+      duplicateToTop(PendingST[i], SR, I);
+      DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+                   << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
+      PendingST[i] = SR;
+    }
+
+    // Make sure we have a unique live register for every fixed use. Some of
+    // them could be undef uses, and we need to emit LD_F0 instructions.
+    for (unsigned i = 0; i < NumSTUses; ++i) {
+      if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
+        // Check for shared assignments.
+        for (unsigned j = 0; j < i; ++j) {
+          if (PendingST[j] != PendingST[i])
+            continue;
+          // STi and STj are in the same register, create a copy.
+          unsigned SR = getScratchReg();
+          duplicateToTop(PendingST[i], SR, I);
+          DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+                       << unsigned(PendingST[i])
+                       << " to avoid collision with ST" << j << '\n');
+          PendingST[i] = SR;
+        }
+        continue;
+      }
+      unsigned SR = getScratchReg();
+      DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
+      BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+      pushReg(SR);
+      PendingST[i] = SR;
+      if (NumPendingSTs == i)
+        ++NumPendingSTs;
+    }
+    assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+
+    // Now we can rearrange the live registers to match what was requested.
+    shuffleStackTop(PendingST, NumPendingSTs, I);
+    DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+    // With the stack layout fixed, rewrite the FP registers.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      unsigned FPReg = getFPReg(Op);
+      Op.setReg(getSTReg(FPReg));
+    }
+
+    // Simulate the inline asm popping its inputs and pushing its outputs.
+    StackTop -= NumSTPopped;
+
+    // Hold the fixed output registers in scratch FP registers. They will be
+    // transferred to real FP registers by copies.
+    NumPendingSTs = 0;
+    for (unsigned i = 0; i < NumSTDefs; ++i) {
+      unsigned SR = getScratchReg();
+      pushReg(SR);
+      FPKills &= ~(1u << SR);
+    }
+    for (unsigned i = 0; i < NumSTDefs; ++i)
+      PendingST[NumPendingSTs++] = getStackEntry(i);
+    DEBUG({dbgs() << "After asm: "; dumpStack();});
+
+    // If any of the ST defs were dead, pop them immediately. Our caller only
+    // handles dead FP defs.
+ MachineBasicBlock::iterator InsertPt = MI; + for (unsigned i = 0; STDefs & (1u << i); ++i) { + if (!(STDeadDefs & (1u << i))) + continue; + freeStackSlotAfter(InsertPt, PendingST[i]); + PendingST[i] = NumFPRegs; + } + while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) + --NumPendingSTs; + + // If this asm kills any FP registers (is the last use of them) we must + // explicitly emit pop instructions for them. Do this now after the asm has + // executed so that the ST(x) numbers are not off (which would happen if we + // did this inline with operand rewriting). + // + // Note: this might be a non-optimal pop sequence. We might be able to do + // better by trying to pop in stack order or something. + while (FPKills) { + unsigned FPReg = CountTrailingZeros_32(FPKills); + if (isLive(FPReg)) + freeStackSlotAfter(InsertPt, FPReg); + FPKills &= ~(1U << FPReg); + } + // Don't delete the inline asm! + return; + } + + case X86::WIN_FTOL_32: + case X86::WIN_FTOL_64: { + // Push the operand into ST0. + MachineOperand &Op = MI->getOperand(0); + assert(Op.isUse() && Op.isReg() && + Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); + unsigned FPReg = getFPReg(Op); + if (Op.isKill()) + moveToTop(FPReg, I); + else + duplicateToTop(FPReg, FPReg, I); + + // Emit the call. This will pop the operand. + BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) + .addExternalSymbol("_ftol2") + .addReg(X86::ST0, RegState::ImplicitKill) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::EDX, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + --StackTop; + + break; + } + + case X86::RET: + case X86::RETI: + // If RET has an FP register use operand, pass the first one in ST(0) and + // the second one in ST(1). + + // Find the register operands. + unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U; + unsigned LiveMask = 0; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + // FP Register uses must be kills unless there are two uses of the same + // register, in which case only one will be a kill. + assert(Op.isUse() && + (Op.isKill() || // Marked kill. + getFPReg(Op) == FirstFPRegOp || // Second instance. + MI->killsRegister(Op.getReg())) && // Later use is marked kill. + "Ret only defs operands, and values aren't live beyond it"); + + if (FirstFPRegOp == ~0U) + FirstFPRegOp = getFPReg(Op); + else { + assert(SecondFPRegOp == ~0U && "More than two fp operands!"); + SecondFPRegOp = getFPReg(Op); + } + LiveMask |= (1 << getFPReg(Op)); + + // Remove the operand so that later passes don't see it. + MI->RemoveOperand(i); + --i, --e; + } + + // We may have been carrying spurious live-ins, so make sure only the returned + // registers are left live. + adjustLiveRegs(LiveMask, MI); + if (!LiveMask) return; // Quick check to see if any are possible. + + // There are only four possibilities here: + // 1) we are returning a single FP value. In this case, it has to be in + // ST(0) already, so just declare success by removing the value from the + // FP Stack. + if (SecondFPRegOp == ~0U) { + // Assert that the top of stack contains the right FP register. 
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) && + "Top of stack not the right register for RET!"); + + // Ok, everything is good, mark the value as not being on the stack + // anymore so that our assertion about the stack being empty at end of + // block doesn't fire. + StackTop = 0; + return; + } + + // Otherwise, we are returning two values: + // 2) If returning the same value for both, we only have one thing in the FP + // stack. Consider: RET FP1, FP1 + if (StackTop == 1) { + assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&& + "Stack misconfiguration for RET!"); + + // Duplicate the TOS so that we return it twice. Just pick some other FPx + // register to hold it. + unsigned NewReg = getScratchReg(); + duplicateToTop(FirstFPRegOp, NewReg, MI); + FirstFPRegOp = NewReg; + } + + /// Okay we know we have two different FPx operands now: + assert(StackTop == 2 && "Must have two values live!"); + + /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently + /// in ST(1). In this case, emit an fxch. + if (getStackEntry(0) == SecondFPRegOp) { + assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live"); + moveToTop(FirstFPRegOp, MI); + } + + /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in + /// ST(1). Just remove both from our understanding of the stack and return. + assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live"); + assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live"); + StackTop = 0; + return; + } + + I = MBB->erase(I); // Remove the pseudo instruction + + // We want to leave I pointing to the previous instruction, but what if we + // just erased the first instruction? + if (I == MBB->begin()) { + DEBUG(dbgs() << "Inserting dummy KILL\n"); + I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)); + } else + --I; +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp new file mode 100644 index 0000000..42b4e73 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -0,0 +1,1835 @@ +//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +// FIXME: completely move here. 
+extern cl::opt<bool> ForceStackAlign; + +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. +bool X86FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineModuleInfo &MMI = MF.getMMI(); + const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); + + return (MF.getTarget().Options.DisableFramePointerElim(MF) || + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || MF.hasMSInlineAsm() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MMI.callsUnwindInit() || MMI.callsEHReturn()); +} + +static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::ADD64ri8; + return X86::ADD64ri32; + } else { + if (isInt<8>(Imm)) + return X86::ADD32ri8; + return X86::ADD32ri; + } +} + +static unsigned getLEArOpcode(unsigned IsLP64) { + return IsLP64 ? X86::LEA64r : X86::LEA32r; +} + +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live +/// when it reaches the "return" instruction. We can then pop a stack object +/// to this register without worry about clobbering it. +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetRegisterInfo &TRI, + bool Is64Bit) { + const MachineFunction *MF = MBB.getParent(); + const Function *F = MF->getFunction(); + if (!F || MF->getMMI().callsEHReturn()) + return 0; + + static const uint16_t CallerSavedRegs32Bit[] = { + X86::EAX, X86::EDX, X86::ECX, 0 + }; + + static const uint16_t CallerSavedRegs64Bit[] = { + X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, + X86::R8, X86::R9, X86::R10, X86::R11, 0 + }; + + unsigned Opc = MBBI->getOpcode(); + switch (Opc) { + default: return 0; + case X86::RET: + case X86::RETI: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<uint16_t, 8> Uses; + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); + } + + const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; + for (; *CS; ++CS) + if (!Uses.count(*CS)) + return *CS; + } + } + + return 0; +} + + +/// emitSPUpdate - Emit a series of instructions to increment / decrement the +/// stack pointer by a constant value. +static +void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, int64_t NumBytes, + bool Is64Bit, bool IsLP64, bool UseLEA, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { + bool isSub = NumBytes < 0; + uint64_t Offset = isSub ? 
-NumBytes : NumBytes;
+  unsigned Opc;
+  if (UseLEA)
+    Opc = getLEArOpcode(IsLP64);
+  else
+    Opc = isSub
+      ? getSUBriOpcode(IsLP64, Offset)
+      : getADDriOpcode(IsLP64, Offset);
+
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      unsigned Reg = isSub
+        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        Opc = isSub
+          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64Bit ? X86::POP64r : X86::POP32r);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
+        Offset -= ThisVal;
+        continue;
+      }
+    }
+
+    MachineInstr *MI = NULL;
+
+    if (UseLEA) {
+      MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                        StackPtr, false, isSub ? -ThisVal : ThisVal);
+    } else {
+      MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+        .addReg(StackPtr)
+        .addImm(ThisVal);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+    }
+
+    if (isSub)
+      MI->setFlag(MachineInstr::FrameSetup);
+
+    Offset -= ThisVal;
+  }
+}
+
+/// mergeSPUpdatesUp - If the instruction immediately above the passed iterator
+/// adjusts the stack pointer, fold it into *NumBytes and erase it.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                      unsigned StackPtr, uint64_t *NumBytes = NULL) {
+  if (MBBI == MBB.begin()) return;
+
+  MachineBasicBlock::iterator PI = prior(MBBI);
+  unsigned Opc = PI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+      PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  }
+}
+
+/// mergeSPUpdatesDown - If the instruction immediately below the passed
+/// iterator adjusts the stack pointer, fold it into *NumBytes and erase it.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator &MBBI,
+                        unsigned StackPtr, uint64_t *NumBytes = NULL) {
+  // FIXME: THIS ISN'T RUN!!!
+  return;
+
+  if (MBBI == MBB.end()) return;
+
+  MachineBasicBlock::iterator NI = llvm::next(MBBI);
+  if (NI == MBB.end()) return;
+
+  unsigned Opc = NI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      NI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= NI->getOperand(2).getImm();
+    MBB.erase(NI);
+    MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             NI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += NI->getOperand(2).getImm();
+    MBB.erase(NI);
+    MBBI = NI;
+  }
+}
+
+/// mergeSPUpdates - Check the instruction before/after the passed instruction.
+/// If it is an ADD/SUB/LEA instruction it is deleted and the stack adjustment
+/// is returned as a positive value for ADD/LEA and a negative one for SUB.
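+///
+/// For example (an illustrative call): with MBBI pointing just past a
+/// "SUB32ri %ESP, 16", mergeSPUpdates(MBB, MBBI, StackPtr, true) erases the
+/// SUB and returns -16 so the caller can fold the adjustment into its own
+/// stack update.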
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr,
+ bool doMergeWithPrevious) {
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+ Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
+
+static bool isEAXLiveIn(MachineFunction &MF) {
+ for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+ EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+ unsigned Reg = II->first;
+
+ if (Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL)
+ return true;
+ }
+
+ return false;
+}
+
+void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
+ MCSymbol *Label,
+ unsigned FramePtr) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty()) return;
+
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ bool HasFP = hasFP(MF);
+
+ // Calculate the number of bytes used for storing the return address.
+ int stackGrowth = -RegInfo->getSlotSize();
+
+ // FIXME: This is a dirty hack. The code itself is a mess right now. It
+ // should be rewritten from scratch and generalized some day.
+
+ // Determine maximum offset (minimum due to stack growth).
+ int64_t MaxOffset = 0;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I)
+ MaxOffset = std::min(MaxOffset,
+ MFI->getObjectOffset(I->getFrameIdx()));
+
+ // Calculate offsets.
+ int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+ Offset = MaxOffset - Offset + saveAreaOffset;
+
+ // Don't output a new machine move if we're re-saving the frame
+ // pointer. This happens when the PrologEpilogInserter has inserted an extra
+ // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
+ // generates one when frame pointers are used. If we generate a "machine
+ // move" for this extra "PUSH", the linker will lose track of the fact that
+ // the frame pointer should have the value of the first "PUSH" when it's
+ // trying to unwind.
+ //
+ // FIXME: This looks inelegant. It's possibly correct, but it's covering up
+ // another bug. I.e., one where we generate a prolog like this:
+ //
+ // pushl %ebp
+ // movl %esp, %ebp
+ // pushl %ebp
+ // pushl %esi
+ // ...
+ //
+ // The immediate re-push of EBP is unnecessary. At the least, it's an
+ // optimization bug. 
EBP can be used as a scratch register in certain + // cases, but probably not when we have a frame pointer. + if (HasFP && FramePtr == Reg) + continue; + + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(Label, CSDst, CSSrc)); + } +} + +/// getCompactUnwindRegNum - Get the compact unwind number for a given +/// register. The number corresponds to the enum lists in +/// compact_unwind_encoding.h. +static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) { + static const uint16_t CU32BitRegs[] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const uint16_t CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs; + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; +} + +// Number of registers that can be saved in a compact unwind encoding. +#define CU_NUM_SAVED_REGS 6 + +/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding +/// used with frameless stacks. It is passed the number of registers to be saved +/// and an array of the registers saved. +static uint32_t +encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + unsigned RegCount, bool Is64Bit) { + // The saved registers are numbered from 1 to 6. In order to encode the order + // in which they were saved, we re-number them according to their place in the + // register order. The re-numbering is relative to the last re-numbered + // register. E.g., if we have registers {6, 2, 4, 5} saved in that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { + int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::swap(SavedRegs[0], SavedRegs[5]); + std::swap(SavedRegs[1], SavedRegs[4]); + std::swap(SavedRegs[2], SavedRegs[3]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) { + unsigned Countless = 0; + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. + uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; +} + +/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a +/// compact encoding with a frame pointer. 
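+///
+/// An illustrative example (register choice assumed, not from the original
+/// source): for a 32-bit frame that pushes EBX and then ESI, the compact
+/// unwind numbers are 1 (EBX) and 5 (ESI), and the loop below produces
+/// (1 << 3) | 5 = 0xd, with the most recently pushed register in the low
+/// three bits.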
+static uint32_t
+encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
+ bool Is64Bit) {
+ // Encode the registers in the order they were saved, 3 bits per register. The
+ // registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
+ unsigned Reg = SavedRegs[I];
+ if (Reg == 0) continue;
+
+ int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3 bits for each
+ // register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
+ return RegEnc;
+}
+
+uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned StackPtr = RegInfo->getStackRegister();
+
+ bool Is64Bit = STI.is64Bit();
+ bool HasFP = hasFP(MF);
+
+ unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 };
+ unsigned SavedRegIdx = 0;
+
+ unsigned OffsetSize = (Is64Bit ? 8 : 4);
+
+ unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
+ unsigned PushInstrSize = 1;
+ unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+ unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
+ unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
+
+ unsigned StackDivide = (Is64Bit ? 8 : 4);
+
+ unsigned InstrOffset = 0;
+ unsigned StackAdjust = 0;
+ unsigned StackSize = 0;
+
+ MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
+ bool ExpectEnd = false;
+ for (MachineBasicBlock::iterator
+ MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opc = MI.getOpcode();
+ if (Opc == X86::PROLOG_LABEL) continue;
+ if (!MI.getFlag(MachineInstr::FrameSetup)) break;
+
+ // We don't expect any more prolog instructions.
+ if (ExpectEnd) return CU::UNWIND_MODE_DWARF;
+
+ if (Opc == PushInstr) {
+ // If there are too many saved registers, we cannot use compact encoding.
+ if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
+ ExpectEnd = true;
+ continue;
+ }
+
+ SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
+ StackAdjust += OffsetSize;
+ InstrOffset += PushInstrSize;
+ } else if (Opc == MoveInstr) {
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+
+ if (DstReg != FramePtr || SrcReg != StackPtr)
+ return CU::UNWIND_MODE_DWARF;
+
+ StackAdjust = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ SavedRegIdx = 0;
+ InstrOffset += MoveInstrSize;
+ } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
+ if (StackSize)
+ // We already have a stack size.
+ return CU::UNWIND_MODE_DWARF;
+
+ if (!MI.getOperand(0).isReg() ||
+ MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
+ MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
+ // We need this to be a stack adjustment pointer. Something like:
+ //
+ // %RSP<def> = SUB64ri8 %RSP, 48
+ return CU::UNWIND_MODE_DWARF;
+
+ StackSize = MI.getOperand(2).getImm() / StackDivide;
+ SubtractInstrIdx += InstrOffset;
+ ExpectEnd = true;
+ }
+ }
+
+ // Encode that we are using EBP/RBP as the frame pointer. 
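+ // As an illustration (prologue assumed, not from the original source),
+ // scanning
+ //
+ //   pushq %rbp
+ //   movq  %rsp, %rbp
+ //   pushq %rbx
+ //
+ // leaves SavedRegs = { RBX } and StackAdjust = 8, which the frame-pointer
+ // path below encodes compactly.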
+ uint32_t CompactUnwindEncoding = 0;
+ StackAdjust /= StackDivide;
+ if (HasFP) {
+ if ((StackAdjust & 0xFF) != StackAdjust)
+ // Offset was too big for compact encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+ CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+ } else {
+ ++StackAdjust;
+ uint32_t TotalStackSize = StackAdjust + StackSize;
+ if ((TotalStackSize & 0xFF) == TotalStackSize) {
+ // Frameless stack with a small stack size.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+ // Encode the stack size.
+ CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
+ } else {
+ if ((StackAdjust & 0x7) != StackAdjust)
+ // The extra stack adjustments are too big for us to handle.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+ // Encode any extra stack adjustments (done via push instructions).
+ CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+ }
+
+ // Encode the number of registers saved.
+ CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc =
+ encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
+ Is64Bit);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ // Encode the register encoding.
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+ }
+
+ return CompactUnwindEncoding;
+}
+
+/// usesTheStack - This function checks if any of the users of EFLAGS
+/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
+/// to use the stack, and if we don't adjust the stack we clobber the first
+/// frame index.
+/// See X86InstrInfo::copyPhysReg.
+static bool usesTheStack(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
+ re = MRI.reg_end(); ri != re; ++ri)
+ if (ri->isCopy())
+ return true;
+
+ return false;
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool needsFrameMoves = MMI.hasDebugInfo() ||
+ Fn->needsUnwindTableEntry();
+ uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. 
+ bool HasFP = hasFP(MF); + bool Is64Bit = STI.is64Bit(); + bool IsLP64 = STI.isTarget64BitLP64(); + bool IsWin64 = STI.isTargetWin64(); + bool UseLEA = STI.useLeaForSP(); + unsigned StackAlign = getStackAlignment(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned StackPtr = RegInfo->getStackRegister(); + unsigned BasePtr = RegInfo->getBaseRegister(); + DebugLoc DL; + + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum SlotSize. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + + // Add RETADDR move area to callee saved frame size. + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + X86FI->setCalleeSavedFrameSize( + X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf + // function, and use up to 128 bytes of stack space, don't have a frame + // pointer, calls, or dynamic alloca then we do not need to adjust the + // stack pointer (we fit in the Red Zone). We also check that we don't + // push and pop from the stack. + if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoRedZone) && + !RegInfo->needsStackRealignment(MF) && + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !usesTheStack(MF) && // Don't push and pop. + !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack + uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); + if (HasFP) MinSize += SlotSize; + StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); + MFI->setStackSize(StackSize); + } + + // Insert stack pointer adjustment for later moving of return addr. Only + // applies to tail call optimized functions where the callee argument stack + // size is bigger than the callers. + if (TailCallReturnAddrDelta < 0) { + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)), + StackPtr) + .addReg(StackPtr) + .addImm(-TailCallReturnAddrDelta) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + + // Mapping for machine moves: + // + // DST: VirtualFP AND + // SRC: VirtualFP => DW_CFA_def_cfa_offset + // ELSE => DW_CFA_def_cfa + // + // SRC: VirtualFP AND + // DST: Register => DW_CFA_def_cfa_register + // + // ELSE + // OFFSET < 0 => DW_CFA_offset_extended_sf + // REG < 64 => DW_CFA_offset + Reg + // ELSE => DW_CFA_offset_extended + + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + uint64_t NumBytes = 0; + int stackGrowth = -SlotSize; + + if (HasFP) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers are pushed on stack before the stack + // is realigned. 
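+ // For example (values assumed): FrameSize = 40 with MaxAlign = 32 rounds
+ // NumBytes up to 64 in the computation below.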
+ FrameSize -= X86FI->getCalleeSavedFrameSize(); + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + } + + // Get the offset of the stack slot for the EBP register, which is + // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. + // Update the frame offset adjustment. + MFI->setOffsetAdjustment(-NumBytes); + + // Save EBP/RBP into the appropriate stack slot. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(FramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + + if (needsFrameMoves) { + // Mark the place where EBP/RBP was saved. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)) + .addSym(FrameLabel); + + // Define the current CFA rule to use the provided offset. + if (StackSize) { + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + } else { + MachineLocation SPDst(StackPtr); + MachineLocation SPSrc(StackPtr, stackGrowth); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + } + + // Change the rule for the FramePtr to be an "offset" rule. + MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth); + MachineLocation FPSrc(FramePtr); + Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + } + + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + if (needsFrameMoves) { + // Mark effective beginning of when frame pointer becomes valid. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)) + .addSym(FrameLabel); + + // Define the current CFA to use the EBP/RBP register. + MachineLocation FPDst(FramePtr); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + } + + // Mark the FramePtr as live-in in every block except the entry. + for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); + I != E; ++I) + I->addLiveIn(FramePtr); + } else { + NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + } + + // Skip the callee-saved push instructions. + bool PushedRegs = false; + int StackOffset = 2 * stackGrowth; + + while (MBBI != MBB.end() && + (MBBI->getOpcode() == X86::PUSH32r || + MBBI->getOpcode() == X86::PUSH64r)) { + PushedRegs = true; + MBBI->setFlag(MachineInstr::FrameSetup); + ++MBBI; + + if (!HasFP && needsFrameMoves) { + // Mark callee-saved push instruction. + MCSymbol *Label = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); + + // Define the current CFA rule to use the provided offset. + unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr; + MachineLocation SPDst(Ptr); + MachineLocation SPSrc(Ptr, StackOffset); + Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + StackOffset += stackGrowth; + } + } + + // Realign stack after we pushed callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + + // NOTE: We push the registers before realigning the stack, so + // vector callee-saved (xmm) registers may be saved w/o proper + // alignment in this way. 
However, currently these regs are saved in
+ // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
+ // this shouldn't be a problem.
+ if (RegInfo->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+ .addReg(StackPtr)
+ .addImm(-MaxAlign)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+
+ // If there is a SUB32ri of ESP immediately before this instruction, merge
+ // the two. This can be the case when tail call elimination is enabled and
+ // the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately after this
+ // instruction, merge the two instructions.
+ mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+ // Adjust stack pointer: ESP -= numbytes.
+
+ // Windows and cygwin/mingw require a prologue helper routine when allocating
+ // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
+ // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
+ // stack and adjust the stack pointer in one go. The 64-bit version of
+ // __chkstk is only responsible for probing the stack. The 64-bit prologue is
+ // responsible for adjusting the stack pointer. Touching the stack at 4K
+ // increments is necessary to ensure that the guard pages used by the OS
+ // virtual memory manager are allocated in correct sequence.
+ if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) {
+ const char *StackProbeSymbol;
+ bool isSPUpdateNeeded = false;
+
+ if (Is64Bit) {
+ if (STI.isTargetCygMing())
+ StackProbeSymbol = "___chkstk";
+ else {
+ StackProbeSymbol = "__chkstk";
+ isSPUpdateNeeded = true;
+ }
+ } else if (STI.isTargetCygMing())
+ StackProbeSymbol = "_alloca";
+ else
+ StackProbeSymbol = "_chkstk";
+
+ // Check whether EAX is livein for this function.
+ bool isEAXAlive = isEAXLiveIn(MF);
+
+ if (isEAXAlive) {
+ // Sanity check that EAX is not livein for this function.
+ // It should not be, so throw an assert.
+ assert(!Is64Bit && "EAX is livein in x64 case!");
+
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (Is64Bit) {
+ // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+ // Function prologue is responsible for adjusting the stack pointer.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+ // We'll also use 4 already allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
+ .addExternalSymbol(StackProbeSymbol)
+ .addReg(StackPtr, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // MSVC x64's __chkstk needs to adjust %rsp.
+ // FIXME: %rax preserves the offset and should be available. 
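+ // An illustrative Win64 sequence for NumBytes = 0x5000 (value assumed):
+ //
+ //   movabsq $0x5000, %rax
+ //   callq   __chkstk
+ //   subq    $0x5000, %rsp   ; emitted by emitSPUpdate below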
+ if (isSPUpdateNeeded) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, + UseLEA, TII, *RegInfo); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + } + } else if (NumBytes) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, + UseLEA, TII, *RegInfo); + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) { + // Update the frame pointer with the current stack pointer. + unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; + BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { + // Mark end of stack pointer adjustment. + MCSymbol *Label = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)) + .addSym(Label); + + if (!HasFP && NumBytes) { + // Define the current CFA rule to use the provided offset. + if (StackSize) { + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, + -StackSize + stackGrowth); + Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + } else { + MachineLocation SPDst(StackPtr); + MachineLocation SPSrc(StackPtr, stackGrowth); + Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + } + } + + // Emit DWARF info specifying the offsets of the callee-saved registers. + if (PushedRegs) + emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr); + } + + // Darwin 10.7 and greater has support for compact unwind encoding. + if (STI.getTargetTriple().isMacOSX() && + !STI.getTargetTriple().isMacOSXVersionLT(10, 7)) + MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF)); +} + +void X86FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86InstrInfo &TII = *TM.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI != MBB.end() && "Returning block has no instructions"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc DL = MBBI->getDebugLoc(); + bool Is64Bit = STI.is64Bit(); + bool IsLP64 = STI.isTarget64BitLP64(); + bool UseLEA = STI.useLeaForSP(); + unsigned StackAlign = getStackAlignment(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned StackPtr = RegInfo->getStackRegister(); + + switch (RetOpcode) { + default: + llvm_unreachable("Can only insert epilog into returning blocks"); + case X86::RET: + case X86::RETI: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: + break; // These are ok + } + + // Get the number of bytes to allocate from the FrameInfo. 
+ uint64_t StackSize = MFI->getStackSize(); + uint64_t MaxAlign = MFI->getMaxAlignment(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t NumBytes = 0; + + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else + MaxAlign = MaxAlign ? MaxAlign : 4; + } + + if (hasFP(MF)) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers were pushed on stack before the stack + // was realigned. + FrameSize -= CSSize; + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - CSSize; + } + + // Pop EBP. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr); + } else { + NumBytes = StackSize - CSSize; + } + + // Skip the callee-saved pop instructions. + while (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = prior(MBBI); + unsigned Opc = PI->getOpcode(); + + if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && + !PI->isTerminator()) + break; + + --MBBI; + } + MachineBasicBlock::iterator FirstCSPop = MBBI; + + DL = MBBI->getDebugLoc(); + + // If there is an ADD32ri or SUB32ri of ESP immediately before this + // instruction, merge the two instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + + // If dynamic alloca is used, then reset esp to point to the last callee-saved + // slot before popping them off! Same applies for the case, when stack was + // realigned. + if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + if (RegInfo->needsStackRealignment(MF)) + MBBI = FirstCSPop; + if (CSSize != 0) { + unsigned Opc = getLEArOpcode(IsLP64); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + FramePtr, false, -CSSize); + } else { + unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(FramePtr); + } + } else if (NumBytes) { + // Adjust stack pointer back: ESP += numbytes. + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA, + TII, *RegInfo); + } + + // We're returning from function via eh_return. + if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), + StackPtr).addReg(DestAddr.getReg()); + } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || + RetOpcode == X86::TCRETURNmi || + RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || + RetOpcode == X86::TCRETURNmi64) { + bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + + // Adjust stack pointer. 
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+ // Incorporate the retaddr area.
+ Offset = StackAdj - MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Offset) {
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64,
+ UseLEA, TII, *RegInfo);
+ }
+
+ // Jump to label or value in register.
+ if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
+ ? X86::TAILJMPd : X86::TAILJMPd64));
+ if (JumpTarget.isGlobal())
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+ } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
+ ? X86::TAILJMPm : X86::TAILJMPm64));
+ for (unsigned i = 0; i != 5; ++i)
+ MIB.addOperand(MBBI->getOperand(i));
+ } else if (RetOpcode == X86::TCRETURNri64) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)).
+ addReg(JumpTarget.getReg(), RegState::Kill);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
+ addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ MachineInstr *NewMI = prior(MBBI);
+ NewMI->copyImplicitOps(MF, MBBI);
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ (X86FI->getTCReturnAddrDelta() < 0)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int delta = -1*X86FI->getTCReturnAddrDelta();
+ MBBI = MBB.getLastNonDebugInstr();
+
+ // Check for possible merge with preceding ADD instruction.
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII,
+ *RegInfo);
+ }
+}
+
+int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+ uint64_t StackSize = MFI->getStackSize();
+
+ if (RegInfo->hasBasePointer(MF)) {
+ assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + RegInfo->getSlotSize();
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ } else if (RegInfo->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + RegInfo->getSlotSize();
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ // FIXME: Support tail calls
+ } else {
+ if (!hasFP(MF))
+ return Offset + StackSize;
+
+ // Skip the saved EBP. 
+ Offset += RegInfo->getSlotSize(); + + // Skip the RETADDR move area + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + Offset -= TailCallReturnAddrDelta; + } + + return Offset; +} + +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else if (RegInfo->needsStackRealignment(MF)) + FrameReg = RegInfo->getStackRegister(); + else + FrameReg = RegInfo->getFrameRegister(MF); + return getFrameIndexOffset(MF, FI); +} + +bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + + unsigned SlotSize = STI.is64Bit() ? 8 : 4; + unsigned FPReg = TRI->getFrameRegister(MF); + unsigned CalleeFrameSize = 0; + + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + + // Push GPRs. It increases frame size. + unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + if (Reg == FPReg) + // X86RegisterInfo::emitPrologue will handle spilling of frame register. + continue; + CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + X86FI->setCalleeSavedFrameSize(CalleeFrameSize); + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. + // Note that only Win64 ABI might spill XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), + RC, TRI); + } + + return true; +} + +bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Reload XMMs from stack frame. 
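+ // As noted in the spill code above, only the Win64 ABI might place XMMs in
+ // these slots; a reload would look like, e.g., "movaps 16(%rsp), %xmm6"
+ // (offset illustrative).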
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), + RC, TRI); + } + + // POP GPRs. + unsigned FPReg = TRI->getFrameRegister(MF); + unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + if (Reg == FPReg) + // X86RegisterInfo::emitEpilogue will handle restoring of frame register. + continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + } + return true; +} + +void +X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI->CreateFixedObject(-TailCallReturnAddrDelta, + (-1U*SlotSize)+TailCallReturnAddrDelta, true); + } + + if (hasFP(MF)) { + assert((TailCallReturnAddrDelta <= 0) && + "The Delta should always be zero or negative"); + const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); + + // Create a frame entry for the EBP register that must be saved. + int FrameIdx = MFI->CreateFixedObject(SlotSize, + -(int)SlotSize + + TFI.getOffsetOfLocalArea() + + TailCallReturnAddrDelta, + true); + assert(FrameIdx == MFI->getObjectIndexBegin() && + "Slot for EBP register must be last in order to be found!"); + (void)FrameIdx; + } + + // Spill the BasePtr if it's used. + if (RegInfo->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); +} + +static bool +HasNestArgument(const MachineFunction *MF) { + const Function *F = MF->getFunction(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) { + if (I->hasNestAttr()) + return true; + } + return false; +} + +/// GetScratchRegister - Get a temp register for performing work in the +/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform +/// and the properties of the function either one or two registers will be +/// needed. Set primary to true for the first register, false for the second. +static unsigned +GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + + // Erlang stuff. + if (CallingConvention == CallingConv::HiPE) { + if (Is64Bit) + return Primary ? X86::R14 : X86::R13; + else + return Primary ? X86::EBX : X86::EDI; + } + + if (Is64Bit) + return Primary ? X86::R11 : X86::R12; + + bool IsNested = HasNestArgument(&MF); + + if (CallingConvention == CallingConv::X86_FastCall || + CallingConvention == CallingConv::Fast) { + if (IsNested) + report_fatal_error("Segmented stacks does not support fastcall with " + "nested function."); + return Primary ? X86::EAX : X86::ECX; + } + if (IsNested) + return Primary ? X86::EDX : X86::EAX; + return Primary ? 
X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void
+X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+ MachineBasicBlock &prologueMBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ uint64_t StackSize;
+ bool Is64Bit = STI.is64Bit();
+ unsigned TlsReg, TlsOffset;
+ DebugLoc DL;
+
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "Scratch register is live-in");
+
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
+ !STI.isTargetWin32() && !STI.isTargetFreeBSD())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool IsNested = false;
+
+ // We need to know if the function has a nest argument only in 64 bit mode.
+ if (Is64Bit)
+ IsNested = HasNestArgument(&MF);
+
+ // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+ // allocMBB needs to be the last (terminating) instruction.
+
+ for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+ e = prologueMBB.livein_end(); i != e; i++) {
+ allocMBB->addLiveIn(*i);
+ checkMBB->addLiveIn(*i);
+ }
+
+ if (IsNested)
+ allocMBB->addLiveIn(X86::R10);
+
+ MF.push_front(allocMBB);
+ MF.push_front(checkMBB);
+
+ // Eventually StackSize will be calculated by a link-time pass, which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI->getStackSize();
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+ // Read the limit of the current stacklet off the stack_guard location.
+ if (Is64Bit) {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x70;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. 
+ } else if (STI.isTargetFreeBSD()) { + TlsReg = X86::FS; + TlsOffset = 0x18; + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::RSP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else { + if (STI.isTargetLinux()) { + TlsReg = X86::GS; + TlsOffset = 0x30; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x48 + 90*4; + } else if (STI.isTargetWin32()) { + TlsReg = X86::FS; + TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetFreeBSD()) { + report_fatal_error("Segmented stacks not supported on FreeBSD i386."); + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + if (STI.isTargetLinux() || STI.isTargetWin32()) { + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else if (STI.isTargetDarwin()) { + + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register + unsigned ScratchReg2; + bool SaveScratch2; + if (CompareStackPointer) { + // The primary scratch register is available for holding the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); + SaveScratch2 = false; + } else { + // Need to use a second register to hold the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); + + // Unfortunately, with fastcc the second scratch register may hold an arg + SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); + } + + // If Scratch2 is live-in then it needs to be saved + assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && + "Scratch register is live-in and not saved"); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) + .addReg(ScratchReg2, RegState::Kill); + + BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) + .addImm(TlsOffset); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(ScratchReg2).addImm(1).addReg(0) + .addImm(0) + .addReg(TlsReg); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); + } + } + + // This jump is taken if SP >= (Stacklet Limit + Stack Space required). + // It jumps to normal execution of the function body. + BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB); + + // On 32 bit we first push the arguments size and then the frame size. On 64 + // bit, we pass the stack frame size in r10 and the argument size in r11. 
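+ // An illustrative 64-bit expansion for the non-nested case (immediates
+ // assumed):
+ //
+ //   movabsq $StackSize, %r10
+ //   movabsq $ArgSize,   %r11
+ //   callq   __morestack
+ //   retq                      ; MORESTACK_RET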
+ if (Is64Bit) { + // Functions with nested arguments use R10, so it needs to be saved across + // the call to _morestack + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10); + + BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10) + .addImm(StackSize); + BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11) + .addImm(X86FI->getArgumentStackSize()); + MF.getRegInfo().setPhysRegUsed(X86::R10); + MF.getRegInfo().setPhysRegUsed(X86::R11); + } else { + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(StackSize); + } + + // __morestack is in libgcc + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); + else + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); + + allocMBB->addSuccessor(&prologueMBB); + + checkMBB->addSuccessor(allocMBB); + checkMBB->addSuccessor(&prologueMBB); + +#ifdef XDEBUG + MF.verify(); +#endif +} + +/// Erlang programs may need a special prologue to handle the stack size they +/// might need at runtime. That is because Erlang/OTP does not implement a C +/// stack but uses a custom implementation of hybrid stack/heap architecture. +/// (for more information see Eric Stenman's Ph.D. thesis: +/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) +/// +/// CheckStack: +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// OldStart: +/// ... +/// IncStack: +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { + const X86InstrInfo &TII = *TM.getInstrInfo(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize(); + const bool Is64Bit = STI.is64Bit(); + DebugLoc DL; + // HiPE-specific values + const unsigned HipeLeafWords = 24; + const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; + const unsigned Guaranteed = HipeLeafWords * SlotSize; + unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? + MF.getFunction()->arg_size() - CCRegisteredArgs : 0; + unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; + + assert(STI.isTargetLinux() && + "HiPE prologue is only supported on Linux operating systems."); + + // Compute the largest caller's frame that is needed to fit the callees' + // frames. This 'MaxStack' is computed from: + // + // a) the fixed frame size, which is the space needed for all spilled temps, + // b) outgoing on-stack parameter areas, and + // c) the minimum stack space this function needs to make available for the + // functions it calls (a tunable ABI property). + if (MFI->hasCalls()) { + unsigned MoreStackForCalls = 0; + + for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); + MBBI != MBBE; ++MBBI) + for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); + MI != ME; ++MI) { + if (!MI->isCall()) + continue; + + // Get callee operand. + const MachineOperand &MO = MI->getOperand(0); + + // Only take account of global function calls (no closures etc.). 
+ if (!MO.isGlobal()) + continue; + + const Function *F = dyn_cast<Function>(MO.getGlobal()); + if (!F) + continue; + + // Do not update 'MaxStack' for primitive and built-in functions + // (encoded with names either starting with "erlang."/"bif_" or not + // having a ".", such as a simple <Module>.<Function>.<Arity>, or an + // "_", such as the BIF "suspend_0") as they are executed on another + // stack. + if (F->getName().find("erlang.") != StringRef::npos || + F->getName().find("bif_") != StringRef::npos || + F->getName().find_first_of("._") == StringRef::npos) + continue; + + unsigned CalleeStkArity = + F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; + if (HipeLeafWords - 1 > CalleeStkArity) + MoreStackForCalls = std::max(MoreStackForCalls, + (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); + } + MaxStack += MoreStackForCalls; + } + + // If the stack frame needed is larger than the guaranteed then runtime checks + // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. + if (MaxStack > Guaranteed) { + MachineBasicBlock &prologueMBB = MF.front(); + MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); + + for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(), + E = prologueMBB.livein_end(); I != E; I++) { + stackCheckMBB->addLiveIn(*I); + incStackMBB->addLiveIn(*I); + } + + MF.push_front(incStackMBB); + MF.push_front(stackCheckMBB); + + unsigned ScratchReg, SPReg, PReg, SPLimitOffset; + unsigned LEAop, CMPop, CALLop; + if (Is64Bit) { + SPReg = X86::RSP; + PReg = X86::RBP; + LEAop = X86::LEA64r; + CMPop = X86::CMP64rm; + CALLop = X86::CALL64pcrel32; + SPLimitOffset = 0x90; + } else { + SPReg = X86::ESP; + PReg = X86::EBP; + LEAop = X86::LEA32r; + CMPop = X86::CMP32rm; + CALLop = X86::CALLpcrel32; + SPLimitOffset = 0x4c; + } + + ScratchReg = GetScratchRegister(Is64Bit, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "HiPE prologue scratch register is live-in"); + + // Create new MBB for StackCheck: + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + // SPLimitOffset is in a fixed heap location (pointed by BP). + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB); + + // Create new MBB for IncStack: + BuildMI(incStackMBB, DL, TII.get(CALLop)). 
+ addExternalSymbol("inc_stack_0");
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+ stackCheckMBB->addSuccessor(&prologueMBB, 99);
+ stackCheckMBB->addSuccessor(incStackMBB, 1);
+ incStackMBB->addSuccessor(&prologueMBB, 99);
+ incStackMBB->addSuccessor(incStackMBB, 1);
+ }
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+ unsigned StackPtr = RegInfo.getStackRegister();
+ bool reserveCallFrame = hasReservedCallFrame(MF);
+ int Opcode = I->getOpcode();
+ bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ bool IsLP64 = STI.isTarget64BitLP64();
+ DebugLoc DL = I->getDebugLoc();
+ uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+ uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+ I = MBB.erase(I);
+
+ if (!reserveCallFrame) {
+ // If the stack pointer can be changed after prologue, turn the
+ // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackup instruction into 'add ESP, <amt>'
+ // TODO: consider using push / pop instead of sub + store / add
+ if (Amount == 0)
+ return;
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+ MachineInstr *New = 0;
+ if (Opcode == TII.getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ } else {
+ assert(Opcode == TII.getCallFrameDestroyOpcode());
+
+ // Factor out the amount the callee already popped.
+ Amount -= CalleeAmt;
+
+ if (Amount) {
+ unsigned Opc = getADDriOpcode(IsLP64, Amount);
+ New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ }
+ }
+
+ if (New) {
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction.
+ MBB.insert(I, New);
+ }
+
+ return;
+ }
+
+ if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+ MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(CalleeAmt);
+
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // We are not tracking the stack pointer adjustment by the callee, so make
+ // sure we restore the stack pointer immediately after the call, there may
+ // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. 
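+ // For example (illustrative): after a stdcall callee that returns with
+ // "ret $8", a "subl $8, %esp" is inserted right after the call so the
+ // stack pointer stays accurate for any spill code that follows.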
+ MachineBasicBlock::iterator B = MBB.begin();
+ while (I != B && !llvm::prior(I)->isCall())
+ --I;
+ MBB.insert(I, New);
+ }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 0000000..6e309d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,103 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the X86-specific bits of the TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_FRAMELOWERING_H
+#define X86_FRAMELOWERING_H
+
+#include "X86Subtarget.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+namespace CU {
+
+ /// Compact unwind encoding values.
+ enum CompactUnwindEncodings {
+ /// [RE]BP based frame where [RE]BP is pushed on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // end CU namespace
+
+class MCSymbol;
+class X86TargetMachine;
+
+class X86FrameLowering : public TargetFrameLowering {
+ const X86TargetMachine &TM;
+ const X86Subtarget &STI;
+public:
+ explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti)
+ : TargetFrameLowering(StackGrowsDown,
+ sti.getStackAlignment(),
+ (sti.is64Bit() ? -8 : -4)),
+ TM(tm), STI(sti) {
+ }
+
+ void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label,
+ unsigned FramePtr) const;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function. 
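+ ///
+ /// An illustrative 32-bit frame-pointer pair (sizes assumed):
+ ///
+ ///   prologue: pushl %ebp ; movl %esp, %ebp ; subl $24, %esp
+ ///   epilogue: addl $24, %esp ; popl %ebp ; ret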
+ void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + void adjustForSegmentedStacks(MachineFunction &MF) const; + + void adjustForHiPEPrologue(MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; + + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; + uint32_t getCompactUnwindEncoding(MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp new file mode 100644 index 0000000..968b358 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -0,0 +1,2720 @@ +//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a DAG pattern matching instruction selector for X86, +// converting from a legalized dag to a X86 dag. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-isel" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); + +//===----------------------------------------------------------------------===// +// Pattern Matcher Implementation +//===----------------------------------------------------------------------===// + +namespace { + /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses + /// SDValue's instead of register numbers for the leaves of the matched + /// tree. + struct X86ISelAddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + // This is really a union, discriminated by BaseType! 
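For intuition about the fields that follow, the matched X86 form is Segment:[Base + Scale*Index + Disp]. A toy evaluation under that reading (names and values are illustrative, not from this file):

#include <cstdint>

struct AddrParts { uint64_t Base, Index; unsigned Scale; int32_t Disp; };

// The address the hardware computes from the matched components.
static uint64_t effectiveAddress(const AddrParts &A) {
  return A.Base + (uint64_t)A.Scale * A.Index + (int64_t)A.Disp;
}
// {Base=0x1000, Index=3, Scale=4, Disp=8} -> 0x1014, i.e. 8(%base,%index,4).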
+ SDValue Base_Reg; + int Base_FrameIndex; + + unsigned Scale; + SDValue IndexReg; + int32_t Disp; + SDValue Segment; + const GlobalValue *GV; + const Constant *CP; + const BlockAddress *BlockAddr; + const char *ES; + int JT; + unsigned Align; // CP alignment. + unsigned char SymbolFlags; // X86II::MO_* + + X86ISelAddressMode() + : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), + Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0), + SymbolFlags(X86II::MO_NO_FLAG) { + } + + bool hasSymbolicDisplacement() const { + return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0; + } + + bool hasBaseOrIndexReg() const { + return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; + } + + /// isRIPRelative - Return true if this addressing mode is already RIP + /// relative. + bool isRIPRelative() const { + if (BaseType != RegBase) return false; + if (RegisterSDNode *RegNode = + dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) + return RegNode->getReg() == X86::RIP; + return false; + } + + void setBaseReg(SDValue Reg) { + BaseType = RegBase; + Base_Reg = Reg; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() { + dbgs() << "X86ISelAddressMode " << this << '\n'; + dbgs() << "Base_Reg "; + if (Base_Reg.getNode() != 0) + Base_Reg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' + << " Scale" << Scale << '\n' + << "IndexReg "; + if (IndexReg.getNode() != 0) + IndexReg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Disp " << Disp << '\n' + << "GV "; + if (GV) + GV->dump(); + else + dbgs() << "nul"; + dbgs() << " CP "; + if (CP) + CP->dump(); + else + dbgs() << "nul"; + dbgs() << '\n' + << "ES "; + if (ES) + dbgs() << ES; + else + dbgs() << "nul"; + dbgs() << " JT" << JT << " Align" << Align << '\n'; + } +#endif + }; +} + +namespace { + //===--------------------------------------------------------------------===// + /// ISel - X86 specific code to select X86 machine instructions for + /// SelectionDAG operations. + /// + class X86DAGToDAGISel : public SelectionDAGISel { + /// X86Lowering - This object fully describes how to lower LLVM code to an + /// X86-specific SelectionDAG. + const X86TargetLowering &X86Lowering; + + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// OptForSize - If true, selector should try to optimize for code size + /// instead of performance. + bool OptForSize; + + public: + explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), + X86Lowering(*tm.getTargetLowering()), + Subtarget(&tm.getSubtarget<X86Subtarget>()), + OptForSize(false) {} + + virtual const char *getPassName() const { + return "X86 DAG->DAG Instruction Selection"; + } + + virtual void EmitFunctionEntryCode(); + + virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const; + + virtual void PreprocessISelDAG(); + + inline bool immSext8(SDNode *N) const { + return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); + } + + // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit + // sign extended field. + inline bool i64immSExt32(SDNode *N) const { + uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); + return (int64_t)v == (int32_t)v; + } + +// Include the pieces autogenerated from the target description. 
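A note on the i64immSExt32 predicate just above: it asks whether a 64-bit immediate survives a round-trip through int32_t. A standalone equivalent with two boundary cases:

#include <cstdint>

static bool fitsSExt32(uint64_t V) { return (int64_t)V == (int32_t)V; }
// fitsSExt32(0xFFFFFFFF80000000) == true  (-2^31, already sign-extended)
// fitsSExt32(0x0000000080000000) == false (+2^31 is not a sign-extended i32)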
+#include "X86GenDAGISel.inc" + + private: + SDNode *Select(SDNode *N); + SDNode *SelectGather(SDNode *N, unsigned Opc); + SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); + SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); + + bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); + bool MatchAddress(SDValue N, X86ISelAddressMode &AM); + bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth); + bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool SelectLEAAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool SelectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool SelectScalarSSELoad(SDNode *Root, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment, + SDValue &NodeWithChain); + + bool TryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); + + void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI); + + inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? + CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) : + AM.Base_Reg; + Scale = getI8Imm(AM.Scale); + Index = AM.IndexReg; + // These are 32-bit even in 64-bit mode since RIP relative offset + // is 32-bit. + if (AM.GV) + Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(), + MVT::i32, AM.Disp, + AM.SymbolFlags); + else if (AM.CP) + Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, + AM.Align, AM.Disp, AM.SymbolFlags); + else if (AM.ES) { + assert(!AM.Disp && "Non-zero displacement is ignored with ES."); + Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); + } else if (AM.JT != -1) { + assert(!AM.Disp && "Non-zero displacement is ignored with JT."); + Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); + } else if (AM.BlockAddr) + Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, + AM.SymbolFlags); + else + Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32); + + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + } + + /// getI8Imm - Return a target constant with the specified value, of type + /// i8. + inline SDValue getI8Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i8); + } + + /// getI32Imm - Return a target constant with the specified value, of type + /// i32. + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + /// getGlobalBaseReg - Return an SDNode that returns the value of + /// the global base register. Output instructions required to + /// initialize the global base register, if necessary. 
+  ///
+  SDNode *getGlobalBaseReg();
+
+  /// getTargetMachine - Return a reference to the TargetMachine, casted
+  /// to the target-specific type.
+  const X86TargetMachine &getTargetMachine() const {
+    return static_cast<const X86TargetMachine &>(TM);
+  }
+
+  /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+  /// to the target-specific type.
+  const X86InstrInfo *getInstrInfo() const {
+    return getTargetMachine().getInstrInfo();
+  }
+};
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  if (!N.hasOneUse())
+    return false;
+
+  if (N.getOpcode() != ISD::LOAD)
+    return true;
+
+  // If N is a load, do additional profitability checks.
+  if (U == Root) {
+    switch (U->getOpcode()) {
+    default: break;
+    case X86ISD::ADD:
+    case X86ISD::SUB:
+    case X86ISD::AND:
+    case X86ISD::XOR:
+    case X86ISD::OR:
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is an 8-bit immediate we should fold the
+      // immediate instead.  This reduces code size.
+      // e.g.
+      // movl 4(%esp), %eax
+      // addl $4, %eax
+      // vs.
+      // movl $4, %eax
+      // addl 4(%esp), %eax
+      // The former is 2 bytes shorter.  When the increment is 1, the saving
+      // can be 4 bytes (by using incl %eax).
+      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+        if (Imm->getAPIntValue().isSignedIntN(8))
+          return false;
+
+      // If the other operand is a TLS address, we should fold it instead.
+      // This produces
+      // movl    %gs:0, %eax
+      // leal    i@NTPOFF(%eax), %eax
+      // instead of
+      // movl    $i@NTPOFF, %eax
+      // addl    %gs:0, %eax
+      // If the block also has an access to a second TLS address, this will
+      // save a load.
+      // FIXME: This is probably also true for non-TLS addresses.
+      if (Op1.getOpcode() == X86ISD::Wrapper) {
+        SDValue Val = Op1.getOperand(0);
+        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+          return false;
+      }
+    }
+    }
+  }
+
+  return true;
+}
+
+/// MoveBelowOrigChain - Replace the original chain operand of the call with
+/// the load's chain operand and move the load below the call's chain operand.
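On the immediate test in IsProfitableToFold above: isSignedIntN(8) is simply a signed-byte range check. A freestanding sketch (isImm8 is our name):

#include <cstdint>

// Equivalent of Imm->getAPIntValue().isSignedIntN(8).
static bool isImm8(int64_t V) { return V >= -128 && V <= 127; }
// isImm8(4): true, so keep "addl $4, %eax" and fold the load elsewhere;
// isImm8(1000): false, so folding the load into the add loses nothing.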
+static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+                               SDValue Call, SDValue OrigChain) {
+  SmallVector<SDValue, 8> Ops;
+  SDValue Chain = OrigChain.getOperand(0);
+  if (Chain.getNode() == Load.getNode())
+    Ops.push_back(Load.getOperand(0));
+  else {
+    assert(Chain.getOpcode() == ISD::TokenFactor &&
+           "Unexpected chain operand");
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+      if (Chain.getOperand(i).getNode() == Load.getNode())
+        Ops.push_back(Load.getOperand(0));
+      else
+        Ops.push_back(Chain.getOperand(i));
+    SDValue NewChain =
+      CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+                      MVT::Other, &Ops[0], Ops.size());
+    Ops.clear();
+    Ops.push_back(NewChain);
+  }
+  for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
+    Ops.push_back(OrigChain.getOperand(i));
+  CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
+  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+                             Load.getOperand(1), Load.getOperand(2));
+
+  unsigned NumOps = Call.getNode()->getNumOperands();
+  Ops.clear();
+  Ops.push_back(SDValue(Load.getNode(), 1));
+  for (unsigned i = 1, e = NumOps; i != e; ++i)
+    Ops.push_back(Call.getOperand(i));
+  CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
+}
+
+/// isCalleeLoad - Return true if the call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call.  After MoveBelowOrigChain the load is moved between the call and
+  // the chain; this can create a cycle if the load is not folded.  So it is
+  // *really* important that we are sure the load will be folded.
+  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+    return false;
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+  if (!LD ||
+      LD->isVolatile() ||
+      LD->getAddressingMode() != ISD::UNINDEXED ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  // Now let's find the callseq_start.
+  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+    if (!Chain.hasOneUse())
+      return false;
+    Chain = Chain.getOperand(0);
+  }
+
+  if (!Chain.getNumOperands())
+    return false;
+  // Since we are not checking for AA here, conservatively abort if the chain
+  // writes to memory.  It's not safe to move the callee (a load) across a
+  // store.
+  if (isa<MemSDNode>(Chain.getNode()) &&
+      cast<MemSDNode>(Chain.getNode())->writeMem())
+    return false;
+  if (Chain.getOperand(0).getNode() == Callee.getNode())
+    return true;
+  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+      Callee.getValue(1).hasOneUse())
+    return true;
+  return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+  // OptForSize is used in pattern predicates that isel is matching.
+  OptForSize = MF->getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+
+    if (OptLevel != CodeGenOpt::None &&
+        // Only do this when the target doesn't favor register-indirect
+        // calls.
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || + (N->getOpcode() == X86ISD::TC_RETURN && + // Only does this if load can be folded into TC_RETURN. + (Subtarget->is64Bit() || + getTargetMachine().getRelocationModel() != Reloc::PIC_)))) { + /// Also try moving call address load from outside callseq_start to just + /// before the call to allow it to be folded. + /// + /// [Load chain] + /// ^ + /// | + /// [Load] + /// ^ ^ + /// | | + /// / \-- + /// / | + ///[CALLSEQ_START] | + /// ^ | + /// | | + /// [LOAD/C2Reg] | + /// | | + /// \ / + /// \ / + /// [CALL] + bool HasCallSeq = N->getOpcode() == X86ISD::CALL; + SDValue Chain = N->getOperand(0); + SDValue Load = N->getOperand(1); + if (!isCalleeLoad(Load, Chain, HasCallSeq)) + continue; + MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + ++NumLoadMoved; + continue; + } + + // Lower fpround and fpextend nodes that target the FP stack to be store and + // load to the stack. This is a gross hack. We would like to simply mark + // these as being illegal, but when we do that, legalize produces these when + // it expands calls, then expands these in the same legalize pass. We would + // like dag combine to be able to hack on these between the call expansion + // and the node legalization. As such this pass basically does "really + // late" legalization of these inline with the X86 isel pass. + // FIXME: This should only happen when not compiled with -O0. + if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) + continue; + + EVT SrcVT = N->getOperand(0).getValueType(); + EVT DstVT = N->getValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; + + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(1)) + continue; + } + + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + EVT MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + DebugLoc dl = N->getDebugLoc(); + + // FIXME: optimize the case where the src/dest is a load or store? + SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, + N->getOperand(0), + MemTmp, MachinePointerInfo(), MemVT, + false, false, 0); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), + MemVT, false, false, 0); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + + // Now that we did that, the node is dead. 
Increment the iterator to the + // next node to process, then delete N. + ++I; + CurDAG->DeleteNode(N); + } +} + + +/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in +/// the main function. +void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, + MachineFrameInfo *MFI) { + const TargetInstrInfo *TII = TM.getInstrInfo(); + if (Subtarget->isTargetCygMing()) { + unsigned CallOp = + Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32; + BuildMI(BB, DebugLoc(), + TII->get(CallOp)).addExternalSymbol("__main"); + } +} + +void X86DAGToDAGISel::EmitFunctionEntryCode() { + // If this is main, emit special code for main. + if (const Function *Fn = MF->getFunction()) + if (Fn->hasExternalLinkage() && Fn->getName() == "main") + EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo()); +} + +static bool isDispSafeForFrameIndex(int64_t Val) { + // On 64-bit platforms, we can run into an issue where a frame index + // includes a displacement that, when added to the explicit displacement, + // will overflow the displacement field. Assuming that the frame index + // displacement fits into a 31-bit integer (which is only slightly more + // aggressive than the current fundamental assumption that it fits into + // a 32-bit integer), a 31-bit disp should always be safe. + return isInt<31>(Val); +} + +bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, + X86ISelAddressMode &AM) { + int64_t Val = AM.Disp + Offset; + CodeModel::Model M = TM.getCodeModel(); + if (Subtarget->is64Bit()) { + if (!X86::isOffsetSuitableForCodeModel(Val, M, + AM.hasSymbolicDisplacement())) + return true; + // In addition to the checks required for a register base, check that + // we do not try to use an unsafe Disp with a frame index. + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && + !isDispSafeForFrameIndex(Val)) + return true; + } + AM.Disp = Val; + return false; + +} + +bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ + SDValue Address = N->getOperand(1); + + // load gs:0 -> GS segment register. + // load fs:0 -> FS segment register. + // + // This optimization is valid because the GNU TLS model defines that + // gs:0 (or fs:0 on X86-64) contains its own address. + // For more information see http://people.redhat.com/drepper/tls.pdf + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) + if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 && + Subtarget->isTargetLinux()) + switch (N->getPointerInfo().getAddrSpace()) { + case 256: + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + return false; + case 257: + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + return false; + } + + return true; +} + +/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes +/// into an addressing mode. These wrap things that will resolve down into a +/// symbol reference. If no match is possible, this returns true, otherwise it +/// returns false. +bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { + // If the addressing mode already has a symbol as the displacement, we can + // never match another symbol. + if (AM.hasSymbolicDisplacement()) + return true; + + SDValue N0 = N.getOperand(0); + CodeModel::Model M = TM.getCodeModel(); + + // Handle X86-64 rip-relative addresses. We check this before checking direct + // folding because RIP is preferable to non-RIP accesses. 
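Backing up briefly: isDispSafeForFrameIndex above reduces to an isInt<31> range test. A freestanding equivalent (the name fitsSigned31 is ours):

#include <cstdint>

static bool fitsSigned31(int64_t Val) {
  return Val >= -(INT64_C(1) << 30) && Val < (INT64_C(1) << 30);
}
// One bit of headroom is deliberately kept under the signed 32-bit
// displacement field, so adding a frame-index offset cannot overflow it.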
+ if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && + // Under X86-64 non-small code model, GV (and friends) are 64-bits, so + // they cannot be folded into immediate fields. + // FIXME: This can be improved for kernel and other models? + (M == CodeModel::Small || M == CodeModel::Kernel)) { + // Base and index reg must be 0 in order to use %rip as base. + if (AM.hasBaseOrIndexReg()) + return true; + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.GV = G->getGlobal(); + AM.SymbolFlags = G->getTargetFlags(); + if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.SymbolFlags = CP->getTargetFlags(); + if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.BlockAddr = BA->getBlockAddress(); + AM.SymbolFlags = BA->getTargetFlags(); + if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + AM = Backup; + return true; + } + } else + llvm_unreachable("Unhandled symbol reference node."); + + if (N.getOpcode() == X86ISD::WrapperRIP) + AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); + return false; + } + + // Handle the case when globals fit in our immediate field: This is true for + // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit + // mode, this only applies to a non-RIP-relative computation. + if (!Subtarget->is64Bit() || + M == CodeModel::Small || M == CodeModel::Kernel) { + assert(N.getOpcode() != X86ISD::WrapperRIP && + "RIP-relative addressing already handled"); + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + AM.GV = G->getGlobal(); + AM.Disp += G->getOffset(); + AM.SymbolFlags = G->getTargetFlags(); + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.Disp += CP->getOffset(); + AM.SymbolFlags = CP->getTargetFlags(); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + AM.BlockAddr = BA->getBlockAddress(); + AM.Disp += BA->getOffset(); + AM.SymbolFlags = BA->getTargetFlags(); + } else + llvm_unreachable("Unhandled symbol reference node."); + return false; + } + + return true; +} + +/// MatchAddress - Add the specified node to the specified addressing mode, +/// returning true if it cannot be done. This just pattern matches for the +/// addressing mode. +bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { + if (MatchAddressRecursively(N, AM, 0)) + return true; + + // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has + // a smaller encoding and avoids a scaled-index. 
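The post-processing step below relies on a trivial identity: with no base register, 2*Index equals Index + Index, so a scale-2 SIB form can become a base+index form with scale 1 (same value, smaller encoding). As a sketch:

#include <cassert>

static void leaScale2Identity(unsigned long long Index) {
  // lea (,%reg,2) computes 2*Index; lea (%reg,%reg) computes Index + Index.
  assert(2 * Index == Index + Index);
}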
+ if (AM.Scale == 2 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0) { + AM.Base_Reg = AM.IndexReg; + AM.Scale = 1; + } + + // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, + // because it has a smaller encoding. + // TODO: Which other code models can use this? + if (TM.getCodeModel() == CodeModel::Small && + Subtarget->is64Bit() && + AM.Scale == 1 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + AM.IndexReg.getNode() == 0 && + AM.SymbolFlags == X86II::MO_NO_FLAG && + AM.hasSymbolicDisplacement()) + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + + return false; +} + +// Insert a node into the DAG at least before the Pos node's position. This +// will reposition the node as needed, and will assign it a node ID that is <= +// the Pos node's ID. Note that this does *not* preserve the uniqueness of node +// IDs! The selection DAG must no longer depend on their uniqueness when this +// is used. +static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { + if (N.getNode()->getNodeId() == -1 || + N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { + DAG.RepositionNode(Pos.getNode(), N.getNode()); + N.getNode()->setNodeId(Pos.getNode()->getNodeId()); + } +} + +// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This +// allows us to convert the shift and and into an h-register extract and +// a scaled index. Returns false if the simplification is performed. +static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || + !isa<ConstantSDNode>(Shift.getOperand(1)) || + !Shift.hasOneUse()) + return true; + + int ScaleLog = 8 - Shift.getConstantOperandVal(1); + if (ScaleLog <= 0 || ScaleLog >= 4 || + Mask != (0xffu << ScaleLog)) + return true; + + EVT VT = N.getValueType(); + DebugLoc DL = N.getDebugLoc(); + SDValue Eight = DAG.getConstant(8, MVT::i8); + SDValue NewMask = DAG.getConstant(0xff, VT); + SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); + SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, Eight); + InsertDAGNode(DAG, N, Srl); + InsertDAGNode(DAG, N, NewMask); + InsertDAGNode(DAG, N, And); + InsertDAGNode(DAG, N, ShlCount); + InsertDAGNode(DAG, N, Shl); + DAG.ReplaceAllUsesWith(N, Shl); + AM.IndexReg = And; + AM.Scale = (1 << ScaleLog); + return false; +} + +// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this +// allows us to fold the shift into this addressing mode. Returns false if the +// transform succeeded. +static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SHL || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + // Not likely to be profitable if either the AND or SHIFT node has more + // than one use (unless all uses are for address computation). 
Besides, + // isel mechanism requires their node ids to be reused. + if (!N.hasOneUse() || !Shift.hasOneUse()) + return true; + + // Verify that the shift amount is something we can fold. + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) + return true; + + EVT VT = N.getValueType(); + DebugLoc DL = N.getDebugLoc(); + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); + SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, NewMask); + InsertDAGNode(DAG, N, NewAnd); + InsertDAGNode(DAG, N, NewShift); + DAG.ReplaceAllUsesWith(N, NewShift); + + AM.Scale = 1 << ShiftAmt; + AM.IndexReg = NewAnd; + return false; +} + +// Implement some heroics to detect shifts of masked values where the mask can +// be replaced by extending the shift and undoing that in the addressing mode +// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and +// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in +// the addressing mode. This results in code such as: +// +// int f(short *y, int *lookup_table) { +// ... +// return *y + lookup_table[*y >> 11]; +// } +// +// Turning into: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $11, %ecx +// addl (%rsi,%rcx,4), %eax +// +// Instead of: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $9, %ecx +// andl $124, %rcx +// addl (%rsi,%rcx), %eax +// +// Note that this function assumes the mask is provided as a mask *after* the +// value is shifted. The input chain may or may not match that, but computing +// such a mask is trivial. +static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + unsigned MaskLZ = CountLeadingZeros_64(Mask); + unsigned MaskTZ = CountTrailingZeros_64(Mask); + + // The amount of shift we're trying to fit into the addressing mode is taken + // from the trailing zeros of the mask. + unsigned AMShiftAmt = MaskTZ; + + // There is nothing we can do here unless the mask is removing some bits. + // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. + if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + + // We also need to ensure that mask is a continuous run of bits. + if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + + // Scale the leading zero count down based on the actual size of the value. + // Also scale it down based on the size of the shift. + MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt; + + // The final check is to ensure that any masked out high bits of X are + // already known to be zero. Otherwise, the mask has a semantic impact + // other than masking out a couple of low bits. Unfortunately, because of + // the mask, zero extensions will be removed from operands in some cases. 
+ // This code works extra hard to look through extensions because we can + // replace them with zero extensions cheaply if necessary. + bool ReplacingAnyExtend = false; + if (X.getOpcode() == ISD::ANY_EXTEND) { + unsigned ExtendBits = + X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits(); + // Assume that we'll replace the any-extend with a zero-extend, and + // narrow the search to the extended value. + X = X.getOperand(0); + MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; + ReplacingAnyExtend = true; + } + APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(), + MaskLZ); + APInt KnownZero, KnownOne; + DAG.ComputeMaskedBits(X, KnownZero, KnownOne); + if (MaskedHighBits != KnownZero) return true; + + // We've identified a pattern that can be transformed into a single shift + // and an addressing mode. Make it so. + EVT VT = N.getValueType(); + if (ReplacingAnyExtend) { + assert(X.getValueType() != VT); + // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X); + InsertDAGNode(DAG, N, NewX); + X = NewX; + } + DebugLoc DL = N.getDebugLoc(); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8); + SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8); + SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, NewSRLAmt); + InsertDAGNode(DAG, N, NewSRL); + InsertDAGNode(DAG, N, NewSHLAmt); + InsertDAGNode(DAG, N, NewSHL); + DAG.ReplaceAllUsesWith(N, NewSHL); + + AM.Scale = 1 << AMShiftAmt; + AM.IndexReg = NewSRL; + return false; +} + +bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + DebugLoc dl = N.getDebugLoc(); + DEBUG({ + dbgs() << "MatchAddress: "; + AM.dump(); + }); + // Limit recursion. + if (Depth > 5) + return MatchAddressBase(N, AM); + + // If this is already a %rip relative address, we can only merge immediates + // into it. Instead of handling this in every case, we handle it here. + // RIP relative addressing: %rip + 32-bit displacement! + if (AM.isRIPRelative()) { + // FIXME: JumpTable and ExternalSymbol address currently don't like + // displacements. It isn't very important, but this should be fixed for + // consistency. 
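Before continuing, a numeric spot-check of the FoldMaskAndShiftToScale rewrite finished above, using the shrl $9 / andl $124 example from its comment (the explicit narrowing mask stands in for high bits the transform proves zero):

#include <cassert>
#include <cstdint>

static void checkScaleRecovery(uint64_t X) {
  uint64_t Before = (X >> 9) & 124;        // canonical form: srl 9, mask 124
  uint64_t After  = ((X >> 11) & 31) << 2; // rewritten: srl 11, then scale 4
  // The "& 31" models the known-zero high bits that ComputeMaskedBits
  // establishes before the transform fires.
  assert(Before == After);
}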
+ if (!AM.ES && AM.JT != -1) return true; + + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) + if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) + return false; + return true; + } + + switch (N.getOpcode()) { + default: break; + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); + if (!FoldOffsetIntoAddress(Val, AM)) + return false; + break; + } + + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + if (!MatchWrapper(N, AM)) + return false; + break; + + case ISD::LOAD: + if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM)) + return false; + break; + + case ISD::FrameIndex: + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + AM.BaseType = X86ISelAddressMode::FrameIndexBase; + AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); + return false; + } + break; + + case ISD::SHL: + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) + break; + + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) { + unsigned Val = CN->getZExtValue(); + // Note that we handle x<<1 as (,x,2) rather than (x,x) here so + // that the base operand remains free for further matching. If + // the base doesn't end up getting used, a post-processing step + // in MatchAddress turns (,x,2) into (x,x), which is cheaper. + if (Val == 1 || Val == 2 || Val == 3) { + AM.Scale = 1 << Val; + SDValue ShVal = N.getNode()->getOperand(0); + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (CurDAG->isBaseWithConstantOffset(ShVal)) { + AM.IndexReg = ShVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); + uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; + if (!FoldOffsetIntoAddress(Disp, AM)) + return false; + } + + AM.IndexReg = ShVal; + return false; + } + } + break; + + case ISD::SRL: { + // Scale must not be used already. + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + + SDValue And = N.getOperand(0); + if (And.getOpcode() != ISD::AND) break; + SDValue X = And.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getValueSizeInBits() > 64) break; + + // The mask used for the transform is expected to be post-shift, but we + // found the shift first so just apply the shift to the mask before passing + // it down. + if (!isa<ConstantSDNode>(N.getOperand(1)) || + !isa<ConstantSDNode>(And.getOperand(1))) + break; + uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); + + // Try to fold the mask and shift into the scale, and return false if we + // succeed. + if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + return false; + break; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + // A mul_lohi where we need the low part can be folded as a plain multiply. 
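Returning to the SHL case above: folding the constant addend through the shift relies on shifts distributing over addition in wrapping arithmetic. A sketch (valid for the matched shift amounts 1-3):

#include <cassert>
#include <cstdint>

static void checkShlDispFold(uint64_t X, uint64_t C, unsigned V) {
  // (x + c) << v == (x << v) + (c << v): x becomes the index with
  // Scale = 1 << v, and c << v lands in the displacement field.
  assert(((X + C) << V) == ((X << V) + (C << V)));
}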
+ if (N.getResNo() != 0) break; + // FALL THROUGH + case ISD::MUL: + case X86ISD::MUL_IMM: + // X*[3,5,9] -> X+X*[2,4,8] + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + AM.IndexReg.getNode() == 0) { + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) + if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || + CN->getZExtValue() == 9) { + AM.Scale = unsigned(CN->getZExtValue())-1; + + SDValue MulVal = N.getNode()->getOperand(0); + SDValue Reg; + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && + isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) { + Reg = MulVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); + uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); + if (FoldOffsetIntoAddress(Disp, AM)) + Reg = N.getNode()->getOperand(0); + } else { + Reg = N.getNode()->getOperand(0); + } + + AM.IndexReg = AM.Base_Reg = Reg; + return false; + } + } + break; + + case ISD::SUB: { + // Given A-B, if A can be completely folded into the address and + // the index field with the index field unused, use -B as the index. + // This is a win if a has multiple parts that can be folded into + // the address. Also, this saves a mov if the base register has + // other uses, since it avoids a two-address sub instruction, however + // it costs an additional mov if the index register has other uses. + + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + // Test if the LHS of the sub can be folded. + X86ISelAddressMode Backup = AM; + if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + AM = Backup; + break; + } + // Test if the index field is free for use. + if (AM.IndexReg.getNode() || AM.isRIPRelative()) { + AM = Backup; + break; + } + + int Cost = 0; + SDValue RHS = Handle.getValue().getNode()->getOperand(1); + // If the RHS involves a register with multiple uses, this + // transformation incurs an extra mov, due to the neg instruction + // clobbering its operand. + if (!RHS.getNode()->hasOneUse() || + RHS.getNode()->getOpcode() == ISD::CopyFromReg || + RHS.getNode()->getOpcode() == ISD::TRUNCATE || + RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || + (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && + RHS.getNode()->getOperand(0).getValueType() == MVT::i32)) + ++Cost; + // If the base is a register with multiple uses, this + // transformation may save a mov. + if ((AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() && + !AM.Base_Reg.getNode()->hasOneUse()) || + AM.BaseType == X86ISelAddressMode::FrameIndexBase) + --Cost; + // If the folded LHS was interesting, this transformation saves + // address arithmetic. + if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + + ((AM.Disp != 0) && (Backup.Disp == 0)) + + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) + --Cost; + // If it doesn't look like it may be an overall win, don't do it. + if (Cost >= 0) { + AM = Backup; + break; + } + + // Ok, the transformation is legal and appears profitable. Go for it. 
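The identity being exploited, in the wrapping arithmetic the DAG uses (the code below materializes the 0 - RHS node explicitly):

#include <cassert>

static void checkNegIndex(unsigned long long A, unsigned long long B) {
  // A - B == A + 1*(0 - B): the negated RHS becomes the index with scale 1.
  assert(A - B == A + 1 * (0 - B));
}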
+ SDValue Zero = CurDAG->getConstant(0, N.getValueType()); + SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); + AM.IndexReg = Neg; + AM.Scale = 1; + + // Insert the new nodes into the topological ordering. + InsertDAGNode(*CurDAG, N, Zero); + InsertDAGNode(*CurDAG, N, Neg); + return false; + } + + case ISD::ADD: { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& + !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + break; + } + + case ISD::OR: + // Handle "X | C" as "X + C" iff X is known to have C bits clear. + if (CurDAG->isBaseWithConstantOffset(N)) { + X86ISelAddressMode Backup = AM; + ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); + + // Start with the LHS as an addr mode. + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) + return false; + AM = Backup; + } + break; + + case ISD::AND: { + // Perform some heroic transforms on an and of a constant-count shift + // with a constant to enable use of the scaled offset field. + + // Scale must not be used already. + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + + SDValue Shift = N.getOperand(0); + if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; + SDValue X = Shift.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getValueSizeInBits() > 64) break; + + if (!isa<ConstantSDNode>(N.getOperand(1))) + break; + uint64_t Mask = N.getConstantOperandVal(1); + + // Try to fold the mask and shift into an extract and scale. + if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift directly into the scale. + if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to swap the mask and shift to place shifts which can be done as + // a scale on the outside of the mask. + if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + return false; + break; + } + } + + return MatchAddressBase(N, AM); +} + +/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the +/// specified addressing mode without any further recursion. +bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { + // Is the base register already occupied? + if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { + // If so, check to see if the scale index register is set. + if (AM.IndexReg.getNode() == 0) { + AM.IndexReg = N; + AM.Scale = 1; + return false; + } + + // Otherwise, we cannot select it. 
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base_Reg = N;
+  return false;
+}
+
+/// SelectAddr - returns true if it is able to pattern match an addressing
+/// mode.  It returns the operands which make up the maximal addressing mode
+/// it can match by reference.
+///
+/// Parent is the parent node of the addr operand that is being matched.  It
+/// is always a load, store, atomic node, or null.  It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                 SDValue &Scale, SDValue &Index,
+                                 SDValue &Disp, SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  if (Parent &&
+      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+      // that are not a MemSDNode, and thus don't have proper addrspace info.
+      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+    unsigned AddrSpace =
+      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+    // AddrSpace 256 -> GS, 257 -> FS.
+    if (AddrSpace == 256)
+      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+    if (AddrSpace == 257)
+      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+  }
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base_Reg.getNode())
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+///   PatternNodeWithChain: this is the matched node that has a chain input
+///   and output.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
+                                          SDValue N, SDValue &Base,
+                                          SDValue &Scale, SDValue &Index,
+                                          SDValue &Disp, SDValue &Segment,
+                                          SDValue &PatternNodeWithChain) {
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    PatternNodeWithChain = N.getOperand(0);
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        PatternNodeWithChain.hasOneUse() &&
+        IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+        IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+      if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+        return false;
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).getNode()->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+      N.getOperand(0).getOperand(0).hasOneUse() &&
+      IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+      IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+    // Okay, this is a zero extending load.  Fold it.
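The pattern folded here, rendered as instructions (our rendering, assuming SSE/SSE2 scalar loads):

// (vzext_movl (scalar_to_vector (load addr)))
//   f32 element: movss addr, %xmm   ; fills the low lane, zeroes lanes 1-3
//   f64 element: movsd addr, %xmm   ; fills the low half, zeroes the top half
// The memory forms of these scalar moves already provide the "upper
// elements are zero" guarantee that the conditions above verify.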
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+    if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+      return false;
+    PatternNodeWithChain = SDValue(LD, 0);
+    return true;
+  }
+  return false;
+}
+
+
+/// SelectLEAAddr - It calls SelectAddr and determines if the maximal
+/// addressing mode it matches can be cost-effectively emitted as an LEA
+/// instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
+                                    SDValue &Base, SDValue &Scale,
+                                    SDValue &Index, SDValue &Disp,
+                                    SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  // Set AM.Segment to prevent MatchAddress from using one.  LEA doesn't
+  // support segments.
+  SDValue Copy = AM.Segment;
+  SDValue T = CurDAG->getRegister(0, MVT::i32);
+  AM.Segment = T;
+  if (MatchAddress(N, AM))
+    return false;
+  assert (T == AM.Segment);
+  AM.Segment = Copy;
+
+  EVT VT = N.getValueType();
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase)
+    if (AM.Base_Reg.getNode())
+      Complexity = 1;
+    else
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.getNode())
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2).  It's cheaper to do addl %reg, %reg or a
+  // simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA.  This is determined with some experimentation but is by no means
+  // optimal (especially for code size considerations).  LEA is nice because of
+  // its three-address nature.  Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.hasSymbolicDisplacement()) {
+    // For X86-64, we should always use lea to materialize RIP-relative
+    // addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+    Complexity++;
+
+  // If it isn't worth using an LEA, reject it.
+  if (Complexity <= 2)
+    return false;
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
+                                        SDValue &Scale, SDValue &Index,
+                                        SDValue &Disp, SDValue &Segment) {
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+  X86ISelAddressMode AM;
+  AM.GV = GA->getGlobal();
+  AM.Disp += GA->getOffset();
+  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+  AM.SymbolFlags = GA->getTargetFlags();
+
+  if (N.getValueType() == MVT::i32) {
+    AM.Scale = 1;
+    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+  }
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+
+bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+                                  SDValue &Base, SDValue &Scale,
+                                  SDValue &Index, SDValue &Disp,
+                                  SDValue &Segment) {
+  if (!ISD::isNON_EXTLoad(N.getNode()) ||
+      !IsProfitableToFold(N, P, P) ||
+      !IsLegalToFold(N, P, P, OptLevel))
+    return false;
+
+  return SelectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register.  Output instructions required to
+/// initialize the global base register, if necessary.
+/// +SDNode *X86DAGToDAGISel::getGlobalBaseReg() { + unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); + return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); +} + +SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { + SDValue Chain = Node->getOperand(0); + SDValue In1 = Node->getOperand(1); + SDValue In2L = Node->getOperand(2); + SDValue In2H = Node->getOperand(3); + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + MVT::i32, MVT::i32, MVT::Other, Ops); + cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); + return ResNode; +} + +/// Atomic opcode table +/// +enum AtomicOpc { + ADD, + SUB, + INC, + DEC, + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_ADD8mi, + X86::LOCK_ADD8mr, + X86::LOCK_ADD16mi8, + X86::LOCK_ADD16mi, + X86::LOCK_ADD16mr, + X86::LOCK_ADD32mi8, + X86::LOCK_ADD32mi, + X86::LOCK_ADD32mr, + X86::LOCK_ADD64mi8, + X86::LOCK_ADD64mi32, + X86::LOCK_ADD64mr, + }, + { + X86::LOCK_SUB8mi, + X86::LOCK_SUB8mr, + X86::LOCK_SUB16mi8, + X86::LOCK_SUB16mi, + X86::LOCK_SUB16mr, + X86::LOCK_SUB32mi8, + X86::LOCK_SUB32mi, + X86::LOCK_SUB32mr, + X86::LOCK_SUB64mi8, + X86::LOCK_SUB64mi32, + X86::LOCK_SUB64mr, + }, + { + 0, + X86::LOCK_INC8m, + 0, + 0, + X86::LOCK_INC16m, + 0, + 0, + X86::LOCK_INC32m, + 0, + 0, + X86::LOCK_INC64m, + }, + { + 0, + X86::LOCK_DEC8m, + 0, + 0, + X86::LOCK_DEC16m, + 0, + 0, + X86::LOCK_DEC32m, + 0, + 0, + X86::LOCK_DEC64m, + }, + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr, + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr, + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr, + } +}; + +// Return the target constant operand for atomic-load-op and do simple +// translations, such as from atomic-load-add to lock-sub. The return value is +// one of the following 3 cases: +// + target-constant, the operand could be supported as a target constant. +// + empty, the operand is not needed any more with the new op selected. +// + non-empty, otherwise. +static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, + DebugLoc dl, + enum AtomicOpc &Op, EVT NVT, + SDValue Val) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { + int64_t CNVal = CN->getSExtValue(); + // Quit if not 32-bit imm. + if ((int32_t)CNVal != CNVal) + return Val; + // For atomic-load-add, we could do some optimizations. + if (Op == ADD) { + // Translate to INC/DEC if ADD by 1 or -1. 
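The constant rewriting performed in the next few branches, condensed into a standalone classifier (the enumerators are stand-ins for the local AtomicOpc values, and the helper name is ours):

#include <cstdint>

enum SimpleLockOp { LK_ADD, LK_SUB, LK_INC, LK_DEC };

static SimpleLockOp classifyLockAdd(int64_t C, int64_t &Magnitude) {
  Magnitude = 0;
  if (C == 1)  return LK_INC;                       // lock inc, no immediate
  if (C == -1) return LK_DEC;                       // lock dec, no immediate
  if (C < 0)  { Magnitude = -C; return LK_SUB; }    // add -n  ==  sub n
  Magnitude = C;  return LK_ADD;
}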
+ if ((CNVal == 1) || (CNVal == -1)) { + Op = (CNVal == 1) ? INC : DEC; + // No more constant operand after being translated into INC/DEC. + return SDValue(); + } + // Translate to SUB if ADD by negative value. + if (CNVal < 0) { + Op = SUB; + CNVal = -CNVal; + } + } + return CurDAG->getTargetConstant(CNVal, NVT); + } + + // If the value operand is single-used, try to optimize it. + if (Op == ADD && Val.hasOneUse()) { + // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x). + if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) { + Op = SUB; + return Val.getOperand(1); + } + // A special case for i16, which needs truncating as, in most cases, it's + // promoted to i32. We will translate + // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)) + if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 && + Val.getOperand(0).getOpcode() == ISD::SUB && + X86::isZeroNode(Val.getOperand(0).getOperand(0))) { + Op = SUB; + Val = Val.getOperand(0); + return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT, + Val.getOperand(1)); + } + } + + return Val; +} + +SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return 0; + + DebugLoc dl = Node->getDebugLoc(); + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return 0; + + // Which index into the table. + enum AtomicOpc Op; + switch (Node->getOpcode()) { + default: + return 0; + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + case ISD::ATOMIC_LOAD_ADD: + Op = ADD; + break; + } + + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val); + bool isUnOp = !Val.getNode(); + bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); + + unsigned Opc = 0; + switch (NVT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + Opc = AtomicOpcTbl[Op][I64]; + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + } + break; + } + + assert(Opc != 0 && "Invalid arith lock transform!"); + + SDValue Ret; + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + if (isUnOp) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); + } else { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + Ret = 
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); + } + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); +} + +/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has +/// any uses which require the SF or OF bits to be accurate. +static bool HasNoSignedComparisonUses(SDNode *N) { + // Examine each user of the node. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); UI != UE; ++UI) { + // Only examine CopyToReg uses. + if (UI->getOpcode() != ISD::CopyToReg) + return false; + // Only examine CopyToReg uses that copy to EFLAGS. + if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != + X86::EFLAGS) + return false; + // Examine each user of the CopyToReg use. + for (SDNode::use_iterator FlagUI = UI->use_begin(), + FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { + // Only examine the Flag result. + if (FlagUI.getUse().getResNo() != 1) continue; + // Anything unusual: assume conservatively. + if (!FlagUI->isMachineOpcode()) return false; + // Examine the opcode of the user. + switch (FlagUI->getMachineOpcode()) { + // These comparisons don't treat the most significant bit specially. + case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr: + case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: + case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: + case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: + case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4: + case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4: + case X86::CMOVA16rr: case X86::CMOVA16rm: + case X86::CMOVA32rr: case X86::CMOVA32rm: + case X86::CMOVA64rr: case X86::CMOVA64rm: + case X86::CMOVAE16rr: case X86::CMOVAE16rm: + case X86::CMOVAE32rr: case X86::CMOVAE32rm: + case X86::CMOVAE64rr: case X86::CMOVAE64rm: + case X86::CMOVB16rr: case X86::CMOVB16rm: + case X86::CMOVB32rr: case X86::CMOVB32rm: + case X86::CMOVB64rr: case X86::CMOVB64rm: + case X86::CMOVBE16rr: case X86::CMOVBE16rm: + case X86::CMOVBE32rr: case X86::CMOVBE32rm: + case X86::CMOVBE64rr: case X86::CMOVBE64rm: + case X86::CMOVE16rr: case X86::CMOVE16rm: + case X86::CMOVE32rr: case X86::CMOVE32rm: + case X86::CMOVE64rr: case X86::CMOVE64rm: + case X86::CMOVNE16rr: case X86::CMOVNE16rm: + case X86::CMOVNE32rr: case X86::CMOVNE32rm: + case X86::CMOVNE64rr: case X86::CMOVNE64rm: + case X86::CMOVNP16rr: case X86::CMOVNP16rm: + case X86::CMOVNP32rr: case X86::CMOVNP32rm: + case X86::CMOVNP64rr: case X86::CMOVNP64rm: + case X86::CMOVP16rr: case X86::CMOVP16rm: + case X86::CMOVP32rr: case X86::CMOVP32rm: + case X86::CMOVP64rr: case X86::CMOVP64rm: + continue; + // Anything else: assume conservatively. + default: return false; + } + } + } + return true; +} + +/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode +/// is suitable for doing the {load; increment or decrement; store} to modify +/// transformation. +static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode* &LoadNode, SDValue &InputChain) { + + // is the value stored the result of a DEC or INC? + if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; + + // is the stored value result 0 of the load? + if (StoredVal.getResNo() != 0) return false; + + // are there other uses of the loaded value than the inc or dec? 
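+  // (if there were, the value would have to live in a register anyway, so
+  // folding the read-modify-write into memory would not pay off)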
+ if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; + + // is the store non-extending and non-indexed? + if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) + return false; + + SDValue Load = StoredVal->getOperand(0); + // Is the stored value a non-extending and non-indexed load? + if (!ISD::isNormalLoad(Load.getNode())) return false; + + // Return LoadNode by reference. + LoadNode = cast<LoadSDNode>(Load); + // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) + EVT LdVT = LoadNode->getMemoryVT(); + if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && + LdVT != MVT::i8) + return false; + + // Is store the only read of the loaded value? + if (!Load.hasOneUse()) + return false; + + // Is the address of the store the same as the load? + if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + + // Check if the chain is produced by the load or is a TokenFactor with + // the load output chain as an operand. Return InputChain by reference. + SDValue Chain = StoreNode->getChain(); + + bool ChainCheck = false; + if (Chain == Load.getValue(1)) { + ChainCheck = true; + InputChain = LoadNode->getChain(); + } else if (Chain.getOpcode() == ISD::TokenFactor) { + SmallVector<SDValue, 4> ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { + ChainCheck = true; + continue; + } + + // Make sure using Op as part of the chain would not cause a cycle here. + // In theory, we could check whether the chain node is a predecessor of + // the load. But that can be very expensive. Instead visit the uses and + // make sure they all have smaller node id than the load. + int LoadId = LoadNode->getNodeId(); + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = UI->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != 0) + continue; + if (UI->getNodeId() > LoadId) + return false; + } + + ChainOps.push_back(Op); + } + + if (ChainCheck) + // Make a new TokenFactor with all the other input chains except + // for the load. + InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(), + MVT::Other, &ChainOps[0], ChainOps.size()); + } + if (!ChainCheck) + return false; + + return true; +} + +/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory +/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { + if (Opc == X86ISD::DEC) { + if (LdVT == MVT::i64) return X86::DEC64m; + if (LdVT == MVT::i32) return X86::DEC32m; + if (LdVT == MVT::i16) return X86::DEC16m; + if (LdVT == MVT::i8) return X86::DEC8m; + } else { + assert(Opc == X86ISD::INC && "unrecognized opcode"); + if (LdVT == MVT::i64) return X86::INC64m; + if (LdVT == MVT::i32) return X86::INC32m; + if (LdVT == MVT::i16) return X86::INC16m; + if (LdVT == MVT::i8) return X86::INC8m; + } + llvm_unreachable("unrecognized size for LdVT"); +} + +/// SelectGather - Customized ISel for GATHER operations. 
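+/// Roughly, for each element i an AVX2 gather computes
+///   dst[i] = (mask[i] < 0) ? *(Base + VIdx[i] * Scale) : VSrc[i]
+/// and zeroes the mask elements it consumed, which is why the machine node
+/// produced below has an extra mask write-back result.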
+/// +SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { + // Operands of Gather: VSrc, Base, VIdx, VMask, Scale + SDValue Chain = Node->getOperand(0); + SDValue VSrc = Node->getOperand(2); + SDValue Base = Node->getOperand(3); + SDValue VIdx = Node->getOperand(4); + SDValue VMask = Node->getOperand(5); + ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); + if (!Scale) + return 0; + + SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), + MVT::Other); + + // Memory Operands: Base, Scale, Index, Disp, Segment + SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i32); + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, + Disp, Segment, VMask, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), VTs, Ops); + // Node has 2 outputs: VDst and MVT::Other. + // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. + // We replace VDst of Node with VDst of ResNode, and Other of Node with Other + // of ResNode. + ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); + return ResNode; +} + +SDNode *X86DAGToDAGISel::Select(SDNode *Node) { + EVT NVT = Node->getValueType(0); + unsigned Opc, MOpc; + unsigned Opcode = Node->getOpcode(); + DebugLoc dl = Node->getDebugLoc(); + + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); + + if (Node->isMachineOpcode()) { + DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); + return NULL; // Already selected. + } + + switch (Opcode) { + default: break; + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: break; + case Intrinsic::x86_avx2_gather_d_pd: + case Intrinsic::x86_avx2_gather_d_pd_256: + case Intrinsic::x86_avx2_gather_q_pd: + case Intrinsic::x86_avx2_gather_q_pd_256: + case Intrinsic::x86_avx2_gather_d_ps: + case Intrinsic::x86_avx2_gather_d_ps_256: + case Intrinsic::x86_avx2_gather_q_ps: + case Intrinsic::x86_avx2_gather_q_ps_256: + case Intrinsic::x86_avx2_gather_d_q: + case Intrinsic::x86_avx2_gather_d_q_256: + case Intrinsic::x86_avx2_gather_q_q: + case Intrinsic::x86_avx2_gather_q_q_256: + case Intrinsic::x86_avx2_gather_d_d: + case Intrinsic::x86_avx2_gather_d_d_256: + case Intrinsic::x86_avx2_gather_q_d: + case Intrinsic::x86_avx2_gather_q_d_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; + case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; + case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; + case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; + case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; + case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; + case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; + case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; + case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; + case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; + case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; + case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; + case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; + case Intrinsic::x86_avx2_gather_d_d_256: Opc = 
X86::VPGATHERDDYrm; break; + case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; + case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; + } + SDNode *RetVal = SelectGather(Node, Opc); + if (RetVal) + // We already called ReplaceUses inside SelectGather. + return NULL; + break; + } + } + break; + } + case X86ISD::GlobalBaseReg: + return getGlobalBaseReg(); + + + case X86ISD::ATOMOR64_DAG: + case X86ISD::ATOMXOR64_DAG: + case X86ISD::ATOMADD64_DAG: + case X86ISD::ATOMSUB64_DAG: + case X86ISD::ATOMNAND64_DAG: + case X86ISD::ATOMAND64_DAG: + case X86ISD::ATOMMAX64_DAG: + case X86ISD::ATOMMIN64_DAG: + case X86ISD::ATOMUMAX64_DAG: + case X86ISD::ATOMUMIN64_DAG: + case X86ISD::ATOMSWAP64_DAG: { + unsigned Opc; + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; + case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; + case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; + case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; + case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; + case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; + case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break; + case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break; + case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break; + case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break; + case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; + } + SDNode *RetVal = SelectAtomic64(Node, Opc); + if (RetVal) + return RetVal; + break; + } + + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_ADD: { + SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + break; + + unsigned ShlOp, Op; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. 
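+    // (A smaller encoding exists for e.g. (x << 8) | 0x7f00: 0x7f00 needs a
+    // 32-bit immediate field, but 0x7f00 >> 8 == 0x7f fits in an i8.)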
+ if (NVT == CstVT) + break; + + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. + SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal)); + } + case X86ISD::UMUL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + unsigned LoReg; + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; + case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; + case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); + return NULL; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = Opcode == ISD::SMUL_LOHI; + bool hasBMI2 = Subtarget->hasBMI2(); + if (!isSigned) { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; + case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; + case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; + MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; + case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r; + MOpc = hasBMI2 ? 
X86::MULX64rm : X86::MUL64m; break;
+      }
+    } else {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+      case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+      }
+    }
+
+    unsigned SrcReg, LoReg, HiReg;
+    switch (Opc) {
+    default: llvm_unreachable("Unknown MUL opcode!");
+    case X86::IMUL8r:
+    case X86::MUL8r:
+      SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+      break;
+    case X86::IMUL16r:
+    case X86::MUL16r:
+      SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+      break;
+    case X86::IMUL32r:
+    case X86::MUL32r:
+      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+      break;
+    case X86::IMUL64r:
+    case X86::MUL64r:
+      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+      break;
+    case X86::MULX32rr:
+      SrcReg = X86::EDX; LoReg = HiReg = 0;
+      break;
+    case X86::MULX64rr:
+      SrcReg = X86::RDX; LoReg = HiReg = 0;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiply is commutative.
+    if (!foldedLoad) {
+      foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+      if (foldedLoad)
+        std::swap(N0, N1);
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+                                          N0, SDValue()).getValue(1);
+    SDValue ResHi, ResLo;
+
+    if (foldedLoad) {
+      SDValue Chain;
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        Chain = SDValue(CNode, 2);
+        InFlag = SDValue(CNode, 3);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        Chain = SDValue(CNode, 0);
+        InFlag = SDValue(CNode, 1);
+      }
+
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), Chain);
+    } else {
+      SDValue Ops[] = { N1, InFlag };
+      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        InFlag = SDValue(CNode, 2);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        InFlag = SDValue(CNode, 0);
+      }
+    }
+
+    // Prevent use of AH in a REX instruction by referencing AX instead.
+    if (HiReg == X86::AH && Subtarget->is64Bit() &&
+        !SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              X86::AX, MVT::i16, InFlag);
+      InFlag = Result.getValue(2);
+      // Get the low part if needed. Don't use getCopyFromReg for aliasing
+      // registers.
+      if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 0),
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+      // Shift AX down 8 bits.
+      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                              Result,
+                       CurDAG->getTargetConstant(8, MVT::i8)), 0);
+      // Then truncate it down to i8.
+      ReplaceUses(SDValue(Node, 1),
+        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    }
+    // Copy the low half of the result, if it is needed.
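+    // (For MULX the low half is already a direct result of the machine node;
+    // otherwise it has to be copied out of AL/AX/EAX/RAX.)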
+ if (!SDValue(Node, 0).use_empty()) { + if (ResLo.getNode() == 0) { + assert(LoReg && "Register for low half is not defined!"); + ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, + InFlag); + InFlag = ResLo.getValue(2); + } + ReplaceUses(SDValue(Node, 0), ResLo); + DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the high half of the result, if it is needed. + if (!SDValue(Node, 1).use_empty()) { + if (ResHi.getNode() == 0) { + assert(HiReg && "Register for high half is not defined!"); + ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, + InFlag); + InFlag = ResHi.getValue(2); + } + ReplaceUses(SDValue(Node, 1), ResHi); + DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); + } + + // Propagate ordering to the last node, for now. + CurDAG->AssignOrdering(InFlag.getNode(), CurDAG->GetOrdering(Node)); + + return NULL; + } + + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = Opcode == ISD::SDIVREM; + if (!isSigned) { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; + case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; + case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; + case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; + } + } else { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; + case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; + case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; + case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; + } + } + + unsigned LoReg, HiReg, ClrReg; + unsigned ClrOpcode, SExtOpcode; + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: + LoReg = X86::AL; ClrReg = HiReg = X86::AH; + ClrOpcode = 0; + SExtOpcode = X86::CBW; + break; + case MVT::i16: + LoReg = X86::AX; HiReg = X86::DX; + ClrOpcode = X86::MOV16r0; ClrReg = X86::DX; + SExtOpcode = X86::CWD; + break; + case MVT::i32: + LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; + ClrOpcode = X86::MOV32r0; + SExtOpcode = X86::CDQ; + break; + case MVT::i64: + LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; + ClrOpcode = X86::MOV64r0; + SExtOpcode = X86::CQO; + break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool signBitIsZero = CurDAG->SignBitIsZero(N0); + + SDValue InFlag; + if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + // Special case for div8, just use a move with zero extension to AX to + // clear the upper 8 bits (AH). 
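+      // (i.e. emit "movzbl %src, %eax" so DIV8r sees AL = dividend and
+      // AH = 0, yielding AL = quotient and AH = remainder.)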
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; + if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, + MVT::Other, Ops), 0); + Chain = Move.getValue(1); + ReplaceUses(N0.getValue(1), Chain); + } else { + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); + Chain = CurDAG->getEntryNode(); + } + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); + InFlag = Chain.getValue(1); + } else { + InFlag = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, + LoReg, N0, SDValue()).getValue(1); + if (isSigned && !signBitIsZero) { + // Sign extend the low part into the high part. + InFlag = + SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); + } else { + // Zero out the high part, effectively zero extending the input. + SDValue ClrNode = + SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0); + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, + ClrNode, InFlag).getValue(1); + } + } + + if (foldedLoad) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + SDNode *CNode = + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); + InFlag = SDValue(CNode, 1); + // Update the chain. + ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + } else { + InFlag = + SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); + } + + // Prevent use of AH in a REX instruction by referencing AX instead. + // Shift it down 8 bits. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + + // If we also need AL (the quotient), get it by extracting a subreg from + // Result. The fast register allocator does not like multiple CopyFromReg + // nodes using aliasing registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 0), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX right by 8 bits instead of using AH. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, MVT::i8)), + 0); + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } + // Copy the division (low) result, if it is needed. + if (!SDValue(Node, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + LoReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 0), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the remainder (high) result, if it is needed. + if (!SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + return NULL; + } + + case X86ISD::CMP: + case X86ISD::SUB: { + // Sometimes a SUB is used to perform comparison. + if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) + // This node is not a CMP. + break; + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to + // use a smaller encoding. 
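+    // Narrowing the test may turn the immediate's top bit into the sign bit
+    // of the smaller compare, changing SF; each case below therefore requires
+    // either a clear top bit or HasNoSignedComparisonUses.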
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + HasNoSignedComparisonUses(Node)) + // Look past the truncate if CMP is the only use of it. + N0 = N0.getOperand(0); + if ((N0.getNode()->getOpcode() == ISD::AND || + (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && + N0.getNode()->hasOneUse() && + N0.getValueType() != MVT::i8 && + X86::isZeroNode(N1)) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); + if (!C) break; + + // For example, convert "testl %eax, $8" to "testb %al, $8" + if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && + (!(C->getZExtValue() & 0x80) || + HasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // On x86-32, only the ABCD registers have 8-bit subregisters. + if (!Subtarget->is64Bit()) { + const TargetRegisterClass *TRC; + switch (N0.getValueType().getSimpleVT().SimpleTy) { + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + } + + // Extract the l-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, + MVT::i8, Reg); + + // Emit a testb. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; + } + + // For example, "testl %eax, $2048" to "testb %ah, $8". + if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && + (!(C->getZExtValue() & 0x8000) || + HasNoSignedComparisonUses(Node))) { + // Shift the immediate right by 8 bits. + SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, + MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // Put the value in an ABCD register. + const TargetRegisterClass *TRC; + switch (N0.getValueType().getSimpleVT().SimpleTy) { + case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + + // Extract the h-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, + MVT::i8, Reg); + + // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only + // target GR8_NOREX registers, so make sure the register class is + // forced. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, + MVT::i32, Subreg, ShiftedImm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; + } + + // For example, "testl %eax, $32776" to "testw %ax, $32776". 
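+      // (32776 == 0x8008, whose bit 15 becomes the sign bit of the i16
+      // immediate, so this particular example also needs the signed-use
+      // check.)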
+      if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+          N0.getValueType() != MVT::i16 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 16-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+                                                        MVT::i16, Reg);
+
+        // Emit a testw.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
+      }
+
+      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+      if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+          N0.getValueType() == MVT::i64 &&
+          (!(C->getZExtValue() & 0x80000000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 32-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+                                                        MVT::i32, Reg);
+
+        // Emit a testl.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
+      }
+    }
+    break;
+  }
+  case ISD::STORE: {
+    // Change a chain of {load; incr or dec; store} of the same value into
+    // a simple increment or decrement through memory of that value, if the
+    // uses of the modified value and its address are suitable.
+    // The DEC64m tablegen pattern is currently not able to match the case
+    // where the EFLAGS on the original DEC are used. (This also applies to
+    // {INC,DEC}X{64,32,16,8}.)
+    // We'll need to improve tablegen to allow flags to be transferred from a
+    // node in the pattern to the result node, probably with a new keyword.
+    // For example, today we have:
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (implicit EFLAGS)]>;
+    // but we may need something like:
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (transferrable EFLAGS)]>;
+
+    StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+    SDValue StoredVal = StoreNode->getOperand(1);
+    unsigned Opc = StoredVal->getOpcode();
+
+    LoadSDNode *LoadNode = 0;
+    SDValue InputChain;
+    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+                             LoadNode, InputChain))
+      break;
+
+    SDValue Base, Scale, Index, Disp, Segment;
+    if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
+                    Base, Scale, Index, Disp, Segment))
+      break;
+
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+    MemOp[0] = StoreNode->getMemOperand();
+    MemOp[1] = LoadNode->getMemOperand();
+    const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+    EVT LdVT = LoadNode->getMemoryVT();
+    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+                                                   Node->getDebugLoc(),
+                                                   MVT::i32, MVT::Other, Ops);
+    Result->setMemRefs(MemOp, MemOp + 2);
+
+    ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+    ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+
+    return Result;
+  }
+  }
+
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(dbgs() << "=> ";
+        if (ResNode == NULL || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        dbgs() << '\n');
+
+  return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1, Op2, Op3, Op4;
+  switch (ConstraintCode) {
+  case 'o':   // offsetable        ??
+  case 'v':   // not offsetable    ??
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(Op2);
+  OutOps.push_back(Op3);
+  OutOps.push_back(Op4);
+  return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..f69f5d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,18598 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <bitset>
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                       SDValue V2);
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 instruction or a
+/// simple subregister reference. Idx is an index in the 128 bits we
+/// want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, DebugLoc dl) {
+  EVT VT = Vec.getValueType();
+  assert(VT.is256BitVector() && "Unexpected vector size!");
+  EVT ElVT = VT.getVectorElementType();
+  unsigned Factor = VT.getSizeInBits()/128;
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+                                  VT.getVectorNumElements()/Factor);
+
+  // Extract from UNDEF is UNDEF.
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(ResultVT);
+
+  // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
+  // we can match to VEXTRACTF128.
+  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+
+  // This is the index of the first element of the 128-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+                               * ElemsPerChunk);
+
+  // If the input is a buildvector just emit a smaller one.
+  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+                               VecIdx);
+
+  return Result;
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128 instruction or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
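+/// For example, inserting a v4f32 at IdxVal 4 into a v8f32 yields a
+/// normalized subvector index of 4, i.e. the upper 128-bit half, which
+/// matches VINSERTF128 with an immediate of 1.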
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + DebugLoc dl) { + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; + + EVT VT = Vec.getValueType(); + assert(VT.is128BitVector() && "Unexpected vector size!"); + + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant 128 bits. + unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, + VecIdx); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + DebugLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); +} + +static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + bool is64Bit = Subtarget->is64Bit(); + + if (Subtarget->isTargetEnvMacho()) { + if (is64Bit) + return new X86_64MachoTargetObjectFile(); + return new TargetLoweringObjectFileMachO(); + } + + if (Subtarget->isTargetLinux()) + return new X86LinuxTargetObjectFile(); + if (Subtarget->isTargetELF()) + return new TargetLoweringObjectFileELF(); + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + return new TargetLoweringObjectFileCOFF(); + llvm_unreachable("unknown subtarget type"); +} + +X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + RegInfo = TM.getRegisterInfo(); + TD = getDataLayout(); + + resetOperationActions(); +} + +void X86TargetLowering::resetOperationActions() { + const TargetMachine &TM = getTargetMachine(); + static bool FirstTimeThrough = true; + + // If none of the target options have changed, then we don't need to reset the + // operation actions. + if (!FirstTimeThrough && TO == TM.Options) return; + + if (!FirstTimeThrough) { + // Reinitialize the actions. + initActions(); + FirstTimeThrough = false; + } + + TO = TM.Options; + + // Set up the TargetLowering object. + static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; + + // X86 is weird, it always uses i8 for shift amounts and setcc results. + setBooleanContents(ZeroOrOneBooleanContent); + // X86-SSE is even stranger. It uses -1 or 0 for vector masks. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // For 64-bit since we have so many registers use the ILP scheduler, for + // 32-bit code use the register pressure specific scheduling. + // For Atom, always use ILP scheduling. 
+ if (Subtarget->isAtom()) + setSchedulingPreference(Sched::ILP); + else if (Subtarget->is64Bit()) + setSchedulingPreference(Sched::ILP); + else + setSchedulingPreference(Sched::RegPressure); + setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); + + // Bypass expensive divides on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { + addBypassSlowDiv(32, 8); + if (Subtarget->is64Bit()) + addBypassSlowDiv(64, 16); + } + + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { + // Setup Windows compiler runtime calls. + setLibcallName(RTLIB::SDIV_I64, "_alldiv"); + setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); + setLibcallName(RTLIB::SREM_I64, "_allrem"); + setLibcallName(RTLIB::UREM_I64, "_aullrem"); + setLibcallName(RTLIB::MUL_I64, "_allmul"); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); + + // The _ftol2 runtime function has an unusual calling conv, which + // is modeled by a special pseudo-instruction. + setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); + setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); + setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); + setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); + } + + if (Subtarget->isTargetDarwin()) { + // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. + setUseUnderscoreSetJmp(false); + setUseUnderscoreLongJmp(false); + } else if (Subtarget->isTargetMingw()) { + // MS runtime is weird: it exports _setjmp, but longjmp! + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(false); + } else { + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(true); + } + + // Set up the register classes. + addRegisterClass(MVT::i8, &X86::GR8RegClass); + addRegisterClass(MVT::i16, &X86::GR16RegClass); + addRegisterClass(MVT::i32, &X86::GR32RegClass); + if (Subtarget->is64Bit()) + addRegisterClass(MVT::i64, &X86::GR64RegClass); + + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + + // We don't accept any truncstore of integer registers. + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8 , Expand); + setTruncStoreAction(MVT::i32, MVT::i16, Expand); + setTruncStoreAction(MVT::i32, MVT::i8 , Expand); + setTruncStoreAction(MVT::i16, MVT::i8, Expand); + + // SETOEQ and SETUNE require checking two conditions. + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this + // operation. + setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + } else if (!TM.Options.UseSoftFloat) { + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. 
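+    // (The conditional FADD adds 2^64 back in when the i64 bit pattern is
+    // negative as a signed value, since FILD interprets its operand as
+    // signed.)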
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + // We have an algorithm for SSE2, and we turn this into a 64-bit + // FILD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + } + + // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have + // this operation. + setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); + + if (!TM.Options.UseSoftFloat) { + // SSE has no i16 to fp conversion, only i32 + if (X86ScalarSSEf32) { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); + } + + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have + // this operation. + setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } + + // Handle FP_TO_UINT by promoting the destination to a larger signed + // conversion. + setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + } else if (!TM.Options.UseSoftFloat) { + // Since AVX is a superset of SSE3, only check for SSE here. + if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) + // Expand FP_TO_UINT into a select. + // FIXME: We would like to use a Custom expander here eventually to do + // the optimal thing for SSE vs. the default expansion in the legalizer. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); + else + // With SSE3 we can use fisttpll to convert to a signed i64; without + // SSE, we're stuck with a fistpll. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + } + + if (isTargetFTOL()) { + // Use the _ftol2 runtime function, which has a pseudo-instruction + // to handle its weird calling convention. + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } + + // TODO: when we have SSE, these could be more efficient, by using movd/movq. + if (!X86ScalarSSEf64) { + setOperationAction(ISD::BITCAST , MVT::f32 , Expand); + setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::BITCAST , MVT::f64 , Expand); + // Without SSE, i64->f64 goes through memory. + setOperationAction(ISD::BITCAST , MVT::i64 , Expand); + } + } + + // Scalar integer divide and remainder are lowered to use operations that + // produce two results, to match the available instructions. 
This exposes + // the two-result form to trivial CSE, which is able to combine x/y and x%y + // into a single instruction. + // + // Scalar integer multiply-high is also lowered to use two-result + // operations, to match the available instructions. However, plain multiply + // (low) operations are left as Legal, as there are single-result + // instructions for this in x86. Using the two-result multiply instructions + // when both high and low results are needed must be arranged by dagcombine. + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { + MVT VT = IntVTs[i]; + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + + // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. + setOperationAction(ISD::ADDC, VT, Custom); + setOperationAction(ISD::ADDE, VT, Custom); + setOperationAction(ISD::SUBC, VT, Custom); + setOperationAction(ISD::SUBE, VT, Custom); + } + + setOperationAction(ISD::BR_JT , MVT::Other, Expand); + setOperationAction(ISD::BRCOND , MVT::Other, Custom); + setOperationAction(ISD::BR_CC , MVT::f32, Expand); + setOperationAction(ISD::BR_CC , MVT::f64, Expand); + setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::i8, Expand); + setOperationAction(ISD::BR_CC , MVT::i16, Expand); + setOperationAction(ISD::BR_CC , MVT::i32, Expand); + setOperationAction(ISD::BR_CC , MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); + setOperationAction(ISD::FREM , MVT::f32 , Expand); + setOperationAction(ISD::FREM , MVT::f64 , Expand); + setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + + // Promote the i8 variants and force them on up to i32 which has a shorter + // encoding. + setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); + if (Subtarget->hasBMI()) { + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + } else { + setOperationAction(ISD::CTTZ , MVT::i16 , Custom); + setOperationAction(ISD::CTTZ , MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ , MVT::i64 , Custom); + } + + if (Subtarget->hasLZCNT()) { + // When promoting the i8 variants, force them to i32 for a shorter + // encoding. 
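+    // (There is no 8-bit lzcnt, and the 32-bit form also avoids the 66h
+    // operand-size prefix that the 16-bit form carries.)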
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
+    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
+    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+  } else {
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
+  }
+
+  if (Subtarget->hasPOPCNT()) {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+  }
+  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
+  // handling; they are a light-weight setjmp/longjmp replacement used for
+  // continuations, user-level threading, and the like. As a result, no other
+  // SjLj exception interfaces are implemented, so please don't build your
+  // own exception handling on top of them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
+  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
+    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
+  }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasSSE1())
+    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
+
+  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
+
+  // Expand certain atomics
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+  }
+
+  if (!Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+  }
+
+  if (Subtarget->hasCmpxchg16b()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+  }
+
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() &&
+      !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing()) {
+    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  if (Subtarget->is64Bit()) {
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::VAARG , MVT::Other, Custom); + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + } else { + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + } + + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); + else if (TM.Options.EnableSegmentedStacks) + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Expand); + + if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { + // f32 and f64 use SSE. + // Set up the FP register classes. + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::FR64RegClass); + + // Use ANDPD to simulate FABS. + setOperationAction(ISD::FABS , MVT::f64, Custom); + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f64, Custom); + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + // Use ANDPD and ORPD to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // Lower this to FGETSIGNx86 plus an AND. + setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); + setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // Expand FP immediates into loads from the stack, except for the special + // cases we handle. + addLegalFPImmediate(APFloat(+0.0)); // xorpd + addLegalFPImmediate(APFloat(+0.0f)); // xorps + } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { + // Use SSE for f32, x87 for f64. + // Set up the FP register classes. + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + + // Use ANDPS to simulate FABS. + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + + // Use ANDPS and ORPS to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // Special cases we handle for FP constants. 
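The ANDPD/XORPD comments above are plain IEEE-754 sign-bit arithmetic, and the same bitwise view explains the "special cases" named just below: +0.0 is the all-zero bit pattern, so xorps/xorpd materializes it without a constant-pool load. A scalar sketch of the masking (helper names ours, not from the patch):

#include <cstdint>
#include <cstring>

static double fabs_via_and(double X) {   // ANDPD: clear bit 63, the sign bit
  uint64_t B;
  std::memcpy(&B, &X, sizeof(X));
  B &= ~(1ULL << 63);
  std::memcpy(&X, &B, sizeof(X));
  return X;
}

static double fneg_via_xor(double X) {   // XORPD: flip bit 63
  uint64_t B;
  std::memcpy(&B, &X, sizeof(X));
  B ^= 1ULL << 63;
  std::memcpy(&X, &B, sizeof(X));
  return X;
}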
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + } + } else if (!TM.Options.UseSoftFloat) { + // f32 and f64 in x87. + // Set up the FP register classes. + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + addRegisterClass(MVT::f32, &X86::RFP32RegClass); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + setOperationAction(ISD::UNDEF, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + } + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + addLegalFPImmediate(APFloat(+0.0f)); // FLD0 + addLegalFPImmediate(APFloat(+1.0f)); // FLD1 + addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS + } + + // We don't support FMA. + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + + // Long double always uses X87. + if (!TM.Options.UseSoftFloat) { + addRegisterClass(MVT::f80, &X86::RFP80RegClass); + setOperationAction(ISD::UNDEF, MVT::f80, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); + { + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); + addLegalFPImmediate(TmpFlt); // FLD0 + TmpFlt.changeSign(); + addLegalFPImmediate(TmpFlt); // FLD0/FCHS + + bool ignored; + APFloat TmpFlt2(+1.0); + TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &ignored); + addLegalFPImmediate(TmpFlt2); // FLD1 + TmpFlt2.changeSign(); + addLegalFPImmediate(TmpFlt2); // FLD1/FCHS + } + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f80, Expand); + setOperationAction(ISD::FCOS , MVT::f80, Expand); + setOperationAction(ISD::FSINCOS, MVT::f80, Expand); + } + + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); + setOperationAction(ISD::FMA, MVT::f80, Expand); + } + + // Always use a library call for pow. + setOperationAction(ISD::FPOW , MVT::f32 , Expand); + setOperationAction(ISD::FPOW , MVT::f64 , Expand); + setOperationAction(ISD::FPOW , MVT::f80 , Expand); + + setOperationAction(ISD::FLOG, MVT::f80, Expand); + setOperationAction(ISD::FLOG2, MVT::f80, Expand); + setOperationAction(ISD::FLOG10, MVT::f80, Expand); + setOperationAction(ISD::FEXP, MVT::f80, Expand); + setOperationAction(ISD::FEXP2, MVT::f80, Expand); + + // First set operation action for all vector types to either promote + // (for widening) or expand (for scalarization). 
Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+ setOperationAction(ISD::ADD , VT, Expand);
+ setOperationAction(ISD::SUB , VT, Expand);
+ setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::MUL , VT, Expand);
+ setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::LOAD, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::SHL, VT, Expand);
+ setOperationAction(ISD::SRA, VT, Expand);
+ setOperationAction(ISD::SRL, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+ setOperationAction(ISD::TRUNCATE, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ setTruncStoreAction(VT,
+ (MVT::SimpleValueType)InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+
setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, Expand); + } + + // FIXME: In order to prevent SSE instructions being expanded to MMX ones + // with -msoft-float, disable use of MMX as well. + if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { + addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); + // No operations on x86mmx supported, everything uses intrinsics. + } + + // MMX-sized vectors (other than x86mmx) are expected to be expanded + // into smaller operations. + setOperationAction(ISD::MULHS, MVT::v8i8, Expand); + setOperationAction(ISD::MULHS, MVT::v4i16, Expand); + setOperationAction(ISD::MULHS, MVT::v2i32, Expand); + setOperationAction(ISD::MULHS, MVT::v1i64, Expand); + setOperationAction(ISD::AND, MVT::v8i8, Expand); + setOperationAction(ISD::AND, MVT::v4i16, Expand); + setOperationAction(ISD::AND, MVT::v2i32, Expand); + setOperationAction(ISD::AND, MVT::v1i64, Expand); + setOperationAction(ISD::OR, MVT::v8i8, Expand); + setOperationAction(ISD::OR, MVT::v4i16, Expand); + setOperationAction(ISD::OR, MVT::v2i32, Expand); + setOperationAction(ISD::OR, MVT::v1i64, Expand); + setOperationAction(ISD::XOR, MVT::v8i8, Expand); + setOperationAction(ISD::XOR, MVT::v4i16, Expand); + setOperationAction(ISD::XOR, MVT::v2i32, Expand); + setOperationAction(ISD::XOR, MVT::v1i64, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); + setOperationAction(ISD::SELECT, MVT::v8i8, Expand); + setOperationAction(ISD::SELECT, MVT::v4i16, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v1i64, Expand); + setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); + setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); + setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); + setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); + + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { + addRegisterClass(MVT::v4f32, &X86::VR128RegClass); + + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); + setOperationAction(ISD::LOAD, MVT::v4f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + } + + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { + addRegisterClass(MVT::v2f64, &X86::VR128RegClass); + + // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM + // registers cannot be used even for integer operations. 
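Before the SSE2 integer vector types get their register classes just below, it is worth pinning down what the blanket Expand from the big vector loop above means in practice: scalarization. Even with SSE2 there is no vector integer divide, so a v4i32 UDIV stays expanded into per-lane scalar work, roughly as follows (sketch; the struct and helper names are ours, and divisor lanes are assumed nonzero):

#include <cstdint>

struct V4i32 { uint32_t Elt[4]; };

static V4i32 udiv_v4i32_expanded(V4i32 A, V4i32 B) {
  V4i32 R;
  for (int i = 0; i < 4; ++i)
    R.Elt[i] = A.Elt[i] / B.Elt[i];   // one scalar udiv per lane
  return R;
}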
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, &X86::VR128RegClass); + + setOperationAction(ISD::ADD, MVT::v16i8, Legal); + setOperationAction(ISD::ADD, MVT::v8i16, Legal); + setOperationAction(ISD::ADD, MVT::v4i32, Legal); + setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::SUB, MVT::v16i8, Legal); + setOperationAction(ISD::SUB, MVT::v8i16, Legal); + setOperationAction(ISD::SUB, MVT::v4i32, Legal); + setOperationAction(ISD::SUB, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::FADD, MVT::v2f64, Legal); + setOperationAction(ISD::FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); + + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; + // Do not attempt to custom lower non-power-of-2 vectors + if (!isPowerOf2_32(VT.getVectorNumElements())) + continue; + // Do not attempt to custom lower non-128-bit vectors + if (!VT.is128BitVector()) + continue; + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + } + + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + } + + // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
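The loop just below can retype all of these operations to v2i64 because 128-bit bitwise instructions are lane-width agnostic: one pand/por/pxor serves i8, i16, i32, and i64 lanes alike. At the intrinsics level (SSE2 sketch; the helper name is ours):

#include <emmintrin.h>

// The same pand is emitted whether the lanes are bytes, words, or qwords.
static __m128i and_any_lanes(__m128i A, __m128i B) {
  return _mm_and_si128(A, B);
}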
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // Do not attempt to promote non-128-bit vectors
+ if (!VT.is128BitVector())
+ continue;
+
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v2i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ // As there is no 64-bit GPR available, we need to build a special custom
+ // sequence to convert from v2i32 to v2f32.
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+ }
+
+ if (Subtarget->hasSSE41()) {
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
+ // i8 and i16 vectors are custom, because the source register and source
+ // memory operand types are not the same width. f32 vectors are
+ // custom since the immediate controlling the insert encodes additional
+ // information.
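At the instruction level, the element inserts configured just below take the destination lane as an immediate; the SSE4.1 intrinsic spelling is roughly this (sketch; helper name ours, requires an SSE4.1-enabled build):

#include <smmintrin.h>

static __m128i put_in_lane3(__m128i V, int X) {
  return _mm_insert_epi32(V, X, 3);   // pinsrd: the immediate picks element 3
}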
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // FIXME: these should be Legal, but that's only for the case where
+ // the index is constant. For now, custom expand to deal with that.
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+ }
+
+ if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
+ }
+
+ if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
+ addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
+
+ setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+
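A note on the Custom shift actions in the SSE2 block above: pre-AVX2 hardware shifts every lane by one shared count, so only uniform amounts map directly onto psllw/pslld/psllq and friends, and genuinely per-element amounts need the custom expansion. The uniform case (SSE2 sketch; helper name ours):

#include <emmintrin.h>

static __m128i shl_all_lanes_by_3(__m128i V) {
  return _mm_slli_epi16(V, 3);   // psllw: one count shared by all eight i16 lanes
}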
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + + setOperationAction(ISD::SRL, MVT::v16i16, Custom); + setOperationAction(ISD::SRL, MVT::v32i8, Custom); + + setOperationAction(ISD::SHL, MVT::v16i16, Custom); + setOperationAction(ISD::SHL, MVT::v32i8, Custom); + + setOperationAction(ISD::SRA, MVT::v16i16, Custom); + setOperationAction(ISD::SRA, MVT::v32i8, Custom); + + setOperationAction(ISD::SDIV, MVT::v16i16, Custom); + + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i64, Custom); + + setOperationAction(ISD::SELECT, MVT::v4f64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i64, Custom); + setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); + + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { + setOperationAction(ISD::FMA, MVT::v8f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f64, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v2f64, Legal); + setOperationAction(ISD::FMA, MVT::f32, Legal); + setOperationAction(ISD::FMA, MVT::f64, Legal); + } + + if (Subtarget->hasInt256()) { + setOperationAction(ISD::ADD, MVT::v4i64, Legal); + setOperationAction(ISD::ADD, MVT::v8i32, Legal); + setOperationAction(ISD::ADD, MVT::v16i16, Legal); + setOperationAction(ISD::ADD, MVT::v32i8, Legal); + + setOperationAction(ISD::SUB, MVT::v4i64, Legal); + setOperationAction(ISD::SUB, MVT::v8i32, Legal); + setOperationAction(ISD::SUB, MVT::v16i16, Legal); + setOperationAction(ISD::SUB, MVT::v32i8, Legal); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Legal); + setOperationAction(ISD::MUL, MVT::v16i16, Legal); + // Don't lower v32i8 because there is no 128-bit byte mul + + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + setOperationAction(ISD::SDIV, MVT::v8i32, Custom); + } else { + setOperationAction(ISD::ADD, MVT::v4i64, Custom); + setOperationAction(ISD::ADD, MVT::v8i32, Custom); + setOperationAction(ISD::ADD, MVT::v16i16, Custom); + setOperationAction(ISD::ADD, MVT::v32i8, Custom); + + setOperationAction(ISD::SUB, MVT::v4i64, Custom); + setOperationAction(ISD::SUB, MVT::v8i32, Custom); + setOperationAction(ISD::SUB, MVT::v16i16, Custom); + setOperationAction(ISD::SUB, MVT::v32i8, Custom); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + 
setOperationAction(ISD::MUL, MVT::v8i32, Custom); + setOperationAction(ISD::MUL, MVT::v16i16, Custom); + // Don't lower v32i8 because there is no 128-bit byte mul + } + + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); + + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + + // Custom lower several nodes for 256-bit types. + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + // Extract subvector is special because the value type + // (result) is 128-bit but the source is 256-bit wide. + if (VT.is128BitVector()) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + // Do not attempt to custom lower other non-256-bit vectors + if (!VT.is256BitVector()) + continue; + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + } + + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. + for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + // Do not attempt to promote non-256-bit vectors + if (!VT.is256BitVector()) + continue; + + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v4i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v4i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v4i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); + } + } + + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion + // of this type with custom code. + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Custom); + } + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + + // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't + // handle type legalization for these operations here. + // + // FIXME: We really should do custom legalization for addition and + // subtraction on x86-32 once PR3203 is fixed. We really can't do much better + // than generic legalization for 64-bit multiplication-with-overflow, though. + for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + // Add/Sub/Mul with overflow operations are custom lowered. 
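For the loop body just below: each overflow node yields the arithmetic result plus a flag, which on x86 falls out of the very same add/sub via CF/OF. The unsigned-add case in scalar terms (sketch; helper name ours):

#include <cstdint>

static bool uaddo32(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;
  return Sum < A;   // carry out == unsigned overflow
}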
+ MVT VT = IntVTs[i];
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+ setOperationAction(ISD::SMULO, VT, Custom);
+ setOperationAction(ISD::UMULO, VT, Custom);
+ }
+
+ // There are no 8-bit 3-address imul/mul instructions
+ setOperationAction(ISD::SMULO, MVT::i8, Expand);
+ setOperationAction(ISD::UMULO, MVT::i8, Expand);
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, 0);
+ setLibcallName(RTLIB::SRL_I128, 0);
+ setLibcallName(RTLIB::SRA_I128, 0);
+ }
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->isTargetDarwin()) {
+ // For MacOSX, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+ // traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ if (Subtarget->is64Bit())
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::XOR);
+
+ computeRegisterProperties();
+
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // do not reduce the limit.
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+ MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ setPrefLoopAlignment(4); // 2^4 bytes.
+
+ // Predictable cmovs don't hurt on Atom because it's in-order.
+ PredictableSelectIsExpensive = !Subtarget->isAtom();
+
+ setPrefFunctionAlignment(4); // 2^4 bytes.
+}
+
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+ if (!VT.isVector()) return MVT::i8;
+ return VT.changeVectorElementTypeToInteger();
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
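The effect of the helper defined just below, at the source level (sketch; the struct names are ours, assuming x86-32 with SSE enabled):

#include <xmmintrin.h>

struct NoVec  { int A; float B[3]; };   // no 128-bit member: byval align stays 4
struct HasVec { int A; __m128 V; };     // 128-bit member: byval align becomes 16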
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getBitWidth() == 128)
+ MaxAlign = 16;
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(STy->getElementType(i), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+ if (Subtarget->is64Bit()) {
+ // Max of 8 and alignment of type.
+ unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ if (TyAlign > 8)
+ return TyAlign;
+ return 8;
+ }
+
+ unsigned Align = 4;
+ if (Subtarget->hasSSE1())
+ getMaxByValAlign(Ty, Align);
+ return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero, the destination alignment can satisfy any
+/// constraint, so it does not need to be checked. Similarly, if SrcAlign is
+/// zero there is no need to check it against an alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+ if ((!IsMemset || ZeroMemset) &&
+ !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) {
+ if (Size >= 16 &&
+ (Subtarget->isUnalignedMemAccessFast() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Size >= 32) {
+ if (Subtarget->hasInt256())
+ return MVT::v8i32;
+ if (Subtarget->hasFp256())
+ return MVT::v8f32;
+ }
+ if (Subtarget->hasSSE2())
+ return MVT::v4i32;
+ if (Subtarget->hasSSE1())
+ return MVT::v4f32;
+ } else if (!MemcpyStrSrc && Size >= 8 &&
+ !Subtarget->is64Bit() &&
+ Subtarget->hasSSE2()) {
+ // Do not use f64 to lower memcpy if source is string constant. It's
+ // better to use i32 to avoid the loads.
+ return MVT::f64;
+ }
+ }
+ if (Subtarget->is64Bit() && Size >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+ if (VT == MVT::f32)
+ return X86ScalarSSEf32;
+ else if (VT == MVT::f64)
+ return X86ScalarSSEf64;
+ return true;
+}
+
+bool
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+ if (Fast)
+ *Fast = Subtarget->isUnalignedMemAccessFast();
+ return true;
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function. The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+ // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+ // symbol.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ // Otherwise, use the normal jump table encoding heuristics.
+ return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid, MCContext &Ctx) const {
+ assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT());
+ // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+ // entries.
+ return MCSymbolRefExpr::Create(MBB->getSymbol(),
+ MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (!Subtarget->is64Bit())
+ // This doesn't have DebugLoc associated with it, but is not really the
+ // same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+ return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+ MCContext &Ctx) const {
+ // X86-64 uses RIP relative addressing based on the jump table label.
+ if (Subtarget->isPICStyleRIPRel())
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+ // Otherwise, the reference is relative to the PIC base.
+ return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+}
+
+// FIXME: Why is this routine here? Move it to RegInfo!
+std::pair<const TargetRegisterClass*, uint8_t>
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
+ const TargetRegisterClass *RRC = 0;
+ uint8_t Cost = 1;
+ switch (VT.SimpleTy) {
+ default:
+ return TargetLowering::findRepresentativeClass(VT);
+ case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+ RRC = Subtarget->is64Bit() ?
+ (const TargetRegisterClass*)&X86::GR64RegClass : + (const TargetRegisterClass*)&X86::GR32RegClass; + break; + case MVT::x86mmx: + RRC = &X86::VR64RegClass; + break; + case MVT::f32: case MVT::f64: + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: + case MVT::v4f64: + RRC = &X86::VR128RegClass; + break; + } + return std::make_pair(RRC, Cost); +} + +bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, + unsigned &Offset) const { + if (!Subtarget->isTargetLinux()) + return false; + + if (Subtarget->is64Bit()) { + // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x28; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x14 on i386 + Offset = 0x14; + AddressSpace = 256; + } + return true; +} + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "X86GenCallingConv.inc" + +bool +X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC_X86); +} + +SDValue +X86TargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + SDValue Flag; + SmallVector<SDValue, 6> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Operand #1 = Bytes To Pop + RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), + MVT::i16)); + + // Copy the result values into the output registers. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + SDValue ValToCopy = OutVals[i]; + EVT ValVT = ValToCopy.getValueType(); + + // Promote values to the appropriate types + if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); + + // If this is x86-64, and we disabled SSE, we can't return FP values, + // or SSE or MMX vectors. + if ((ValVT == MVT::f32 || ValVT == MVT::f64 || + VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + // Likewise we can't return F64 values with SSE1 only. 
gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + if (ValVT == MVT::f64 && + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + report_fatal_error("SSE2 register return with SSE2 disabled"); + + // Returns in ST0/ST1 are handled specially: these are pushed as operands to + // the RET instruction and handled by the FP Stackifier. + if (VA.getLocReg() == X86::ST0 || + VA.getLocReg() == X86::ST1) { + // If this is a copy from an xmm register to ST(0), use an FPExtend to + // change the value to the FP stack register class. + if (isScalarFPTypeInSSEReg(VA.getValVT())) + ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); + RetOps.push_back(ValToCopy); + // Don't emit a copytoreg. + continue; + } + + // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 + // which is returned in RAX / RDX. + if (Subtarget->is64Bit()) { + if (ValVT == MVT::x86mmx) { + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. + if (!Subtarget->hasSSE2()) + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); + } + } + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + unsigned Reg = FuncInfo->getSRetReturnReg(); + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()."); + SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); + + unsigned RetValReg + = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? + X86::RAX : X86::EAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); + Flag = Chain.getValue(1); + + // RAX/EAX now acts like a return value. + RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(X86ISD::RET_FLAG, dl, + MVT::Other, &RetOps[0], RetOps.size()); +} + +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. 
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+ return false;
+
+ bool HasRet = false;
+ for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != X86ISD::RET_FLAG)
+ return false;
+ HasRet = true;
+ }
+
+ if (!HasRet)
+ return false;
+
+ Chain = TCChain;
+ return true;
+}
+
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
+ ISD::NodeType ExtendKind) const {
+ MVT ReturnMVT;
+ // TODO: Is this also valid on 32-bit?
+ if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+ ReturnMVT = MVT::i8;
+ else
+ ReturnMVT = MVT::i32;
+
+ MVT MinVT = getRegisterType(ReturnMVT);
+ return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool Is64Bit = Subtarget->is64Bit();
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getValVT();
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ SDValue Val;
+
+ // If this is a call to a function that returns an fp value on the floating
+ // point stack, we must guarantee the value is popped from the stack, so
+ // a CopyFromReg is not good enough - the copy instruction may be eliminated
+ // if the return value is not used. We use the FpPOP_RETVAL instruction
+ // instead.
+ if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
+ SDValue Ops[] = { Chain, InFlag };
+ Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
+ MVT::Other, MVT::Glue, Ops), 1);
+ Val = Chain.getValue(0);
+
+ // Round the f80 to the right size, which also moves it to the appropriate
+ // xmm register.
+ if (CopyVT != VA.getValVT())
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1));
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is standard for many Windows API routines.
+// It differs from the C calling convention just a little: the callee, not the
+// caller, should clean up the stack. Symbols are also decorated in some fancy
+// way :) It doesn't support any vector arguments.
+// For info on the fast calling convention see the Fast Calling Convention
+// (tail call) implementation, LowerX86_32FastCCCallTo.
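At the source level the difference reads like this (MSVC/MinGW spelling; the declarations are ours, not from the patch):

int __stdcall StdAdd(int A, int B);   // callee pops the 8 argument bytes;
                                      // 32-bit symbol decorated as _StdAdd@8
int __cdecl CAdd(int A, int B);       // caller pops; decorated as plain _CAdd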
+/// callIsStructReturn - Determines whether a call uses struct return
+/// semantics.
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ if (Outs.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// argsAreStructReturn - Determines whether a function uses struct
+/// return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+ if (Ins.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
+/// specified by "Src" to address "Dst" with size and alignment information
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*isVolatile*/false, /*AlwaysInline=*/true,
+ MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// IsTailCallConvention - Return true if the calling convention is one that
+/// supports tail call optimization.
+static bool IsTailCallConvention(CallingConv::ID CC) {
+ return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+ CC == CallingConv::HiPE);
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+ return false;
+
+ CallSite CS(CI);
+ CallingConv::ID CalleeCC = CS.getCallingConv();
+ if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+ return false;
+
+ return true;
+}
+
+/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// a tailcall target by changing its ABI.
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+ bool GuaranteedTailCallOpt) {
+ return GuaranteedTailCallOpt && IsTailCallConvention(CC);
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned i) const {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
+ getTargetMachine().Options.GuaranteedTailCallOpt);
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+ EVT ValVT;
+
+ // If the value is passed by pointer, we have the address passed instead of
+ // the value itself.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
+
+ // FIXME: For now, all byval parameter objects are marked mutable. This can be
+ // changed with more analysis.
+ // In case of tail call optimization, mark all arguments mutable, since they
+ // could be overwritten by the lowering of arguments in case of a tail call.
+ if (Flags.isByVal()) {
+ unsigned Bytes = Flags.getByValSize();
+ if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+ int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+ return DAG.getFrameIndex(FI, getPointerTy());
+ } else {
+ int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getLoad(ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, false, 0);
+ }
+}
+
+SDValue
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals)
+ const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWindows = Subtarget->isTargetWindows();
+ bool IsWin64 = Subtarget->isTargetWin64();
+
+ assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64) {
+ CCInfo.AllocateStack(32, 8);
+ }
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+
+ unsigned LastVal = ~0U;
+ SDValue ArgValue;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ (void)LastVal;
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = &X86::GR32RegClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = &X86::FR32RegClass;
+ else if (RegVT == MVT::f64)
+ RC = &X86::FR64RegClass;
+ else if (RegVT.is256BitVector())
+ RC = &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = &X86::VR128RegClass;
+ else if (RegVT == MVT::x86mmx)
+ RC = &X86::VR64RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
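In C terms, the assert-and-truncate handled just below records that a narrow value arrived widened in a 32-bit register and that narrowing it back is lossless (sketch; helper name ours):

#include <cstdint>

static int8_t read_i8_arg(int32_t ArgReg) {   // ArgReg holds a sign-extended i8
  return (int8_t)ArgReg;                      // the truncate drops no information
}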
+ if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); + + if (VA.isExtInLoc()) { + // Handle MMX values passed in XMM regs. + if (RegVT.isVector()) + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); + else + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + } + } else { + assert(VA.isMemLoc()); + ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); + } + + // If value is passed via pointer - do a load. + if (VA.getLocInfo() == CCValAssign::Indirect) + ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, + MachinePointerInfo(), false, false, false, 0); + + InVals.push_back(ArgValue); + } + + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. + // Save the argument into a virtual register so that we can access it + // from the return points. + if (MF.getFunction()->hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + } + + unsigned StackSize = CCInfo.getNextStackOffset(); + // Align stack specially for tail calls. + if (FuncIsMadeTailCallSafe(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) + StackSize = GetAlignedArgumentStackSize(StackSize, DAG); + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. + if (isVarArg) { + if (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { + FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); + } + if (Is64Bit) { + unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; + + // FIXME: We should really autogenerate these arrays + static const uint16_t GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + static const uint16_t GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + static const uint16_t XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + const uint16_t *GPR64ArgRegs; + unsigned NumXMMRegs = 0; + + if (IsWin64) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + TotalNumIntRegs = 4; + GPR64ArgRegs = GPR64ArgRegsWin64; + } else { + TotalNumIntRegs = 6; TotalNumXMMRegs = 8; + GPR64ArgRegs = GPR64ArgRegs64Bit; + + NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, + TotalNumXMMRegs); + } + unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, + TotalNumIntRegs); + + bool NoImplicitFloatOps = Fn->getAttributes(). 
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+ assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+ NoImplicitFloatOps) &&
+ "SSE register cannot be used when SSE is disabled!");
+ if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+ !Subtarget->hasSSE1())
+ // Kernel mode asks for SSE to be disabled, so don't push them
+ // on the stack.
+ TotalNumXMMRegs = 0;
+
+ if (IsWin64) {
+ const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(
+ MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
+ false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
+ unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+ &X86::GR64RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 11> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+ SaveXMMOps.push_back(ALVal);
+
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex()));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset()));
+
+ for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+ unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
+ &X86::VR128RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+ SaveXMMOps.push_back(Val);
+ }
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other,
+ &SaveXMMOps[0], SaveXMMOps.size()));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+ }
+
+ // Some CCs need callee pop.
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else {
+ FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
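+ // For example, a 32-bit stdcall callee taking two i32 arguments returns
+ // with 'ret 8', popping its 8 bytes of arguments, whereas a C-convention
+ // callee uses a plain 'ret' and leaves the cleanup to the caller.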
+ // If this is an sret function, the return should pop the hidden pointer. + if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + argsAreStructReturn(Ins) == StackStructReturn) + FuncInfo->setBytesToPopOnReturn(4); + } + + if (!Is64Bit) { + // RegSaveFrameIndex is X86-64 only. + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + if (CallConv == CallingConv::X86_FastCall || + CallConv == CallingConv::X86_ThisCall) + // fastcc functions can't have varargs. + FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); + } + + FuncInfo->setArgumentStackSize(StackSize); + + return Chain; +} + +SDValue +X86TargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + if (Flags.isByVal()) + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + + return DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(LocMemOffset), + false, false, 0); +} + +/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call +/// optimization is performed and it is required. +SDValue +X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, + SDValue &OutRetAddr, SDValue Chain, + bool IsTailCall, bool Is64Bit, + int FPDiff, DebugLoc dl) const { + // Adjust the Return address stack slot. + EVT VT = getPointerTy(); + OutRetAddr = getReturnAddressFrameIndex(DAG); + + // Load the "old" Return address. + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), + false, false, false, 0); + return SDValue(OutRetAddr.getNode(), 1); +} + +/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call +/// optimization is performed and it is required (FPDiff!=0). +static SDValue +EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, + unsigned SlotSize, int FPDiff, DebugLoc dl) { + // Store the return address to the appropriate stack slot. + if (!FPDiff) return Chain; + // Calculate the new stack slot for the return address. + int NewReturnAddrFI = + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); + Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack(NewReturnAddrFI), + false, false, 0); + return Chain; +} + +SDValue +X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool &isTailCall = CLI.IsTailCall; + bool isVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWindows = Subtarget->isTargetWindows(); + StructReturnType SR = callIsStructReturn(Outs); + bool IsSibcall = false; + + if (MF.getTarget().Options.DisableTailCalls) + isTailCall = false; + + if (isTailCall) { + // Check if it's really possible to do a tail call. 
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+ Outs, OutVals, Ins, DAG);
+
+ // Sibcalls are automatically detected tailcalls which do not require
+ // ABI changes.
+ if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ IsSibcall = true;
+
+ if (isTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64) {
+ CCInfo.AllocateStack(32, 8);
+ }
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (IsSibcall)
+ // This is a sibcall. The memory operands are already available in the
+ // caller's own stack.
+ NumBytes = 0;
+ else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ IsTailCallConvention(CallConv))
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (isTailCall && !IsSibcall) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+ // Set the delta of movement of the returnaddr stackslot, but only if
+ // the new delta is smaller than the previously recorded one.
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
+ }
+
+ if (!IsSibcall)
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue RetAddrFrIdx;
+ // Load return address for tail calls.
+ if (isTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+ Is64Bit, FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+ // of tail call optimization, arguments are handled later.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ if (RegVT.is128BitVector()) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0);
+ Arg = SpillSlot;
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (isVarArg && IsWin64) {
+ // Win64 ABI requires argument XMM reg to be copied to the corresponding
+ // shadow reg if the callee is a varargs function.
+ unsigned ShadowReg = 0;
+ switch (VA.getLocReg()) {
+ case X86::XMM0: ShadowReg = X86::RCX; break;
+ case X86::XMM1: ShadowReg = X86::RDX; break;
+ case X86::XMM2: ShadowReg = X86::R8; break;
+ case X86::XMM3: ShadowReg = X86::R9; break;
+ }
+ if (ShadowReg)
+ RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+ }
+ } else if (!IsSibcall && (!isTailCall || isByVal)) {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy());
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ if (Subtarget->isPICStyleGOT()) {
+ // ELF / PIC requires the GOT pointer to be in the EBX register before
+ // function calls via the PLT.
+ if (!isTailCall) {
+ RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
+ } else {
+ // If we are tail calling and generating PIC/GOT style code, load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasHiddenVisibility() &&
+ !G->getGlobal()->hasProtectedVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
+ }
+
+ if (Is64Bit && isVarArg && !IsWin64) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an upper bound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const uint16_t XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, MVT::i8)));
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slot.
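+ // FPDiff (computed above) is the size difference between the caller's and
+ // the callee's argument areas; e.g. if the caller pushed 8 bytes of
+ // arguments and the callee needs 24, FPDiff is -16 and the return address
+ // slot has to move by 16 bytes to make room for the extra arguments.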
+ if (isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to the frame pointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to the frame pointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains2[0], MemOpChains2.size());
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(), RegInfo->getSlotSize(),
+ FPDiff, dl);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+ // In the 64-bit large code model, we have to make all calls
+ // through a register, since the call instruction's 32-bit
+ // pc-relative offset may not be large enough to hold the whole
+ // address.
+ } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // If the callee is a GlobalAddress node (quite common, every direct call
+ // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+ // it.
+
+ // We should use an extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ const GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportLinkage()) {
+ unsigned char OpFlags = 0;
+ bool ExtraLoad = false;
+ unsigned WrapperKind = ISD::DELETED_NODE;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+ // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (GV->isDeclaration() || GV->isWeakForLinker()) && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } else if (Subtarget->isPICStyleRIPRel() && + isa<Function>(GV) && + cast<Function>(GV)->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::NonLazyBind)) { + // If the function is marked as non-lazy, generate an indirect call + // which loads from the GOT directly. This avoids runtime overhead + // at the cost of eager binding (and one extra byte of encoding). + OpFlags = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + ExtraLoad = true; + } + + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), + G->getOffset(), OpFlags); + + // Add a wrapper if needed. + if (WrapperKind != ISD::DELETED_NODE) + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); + // Add extra indirection if needed. + if (ExtraLoad) + Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(), + false, false, false, 0); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + unsigned char OpFlags = 0; + + // On ELF targets, in either X86-64 or X86-32 mode, direct calls to + // external symbols should go through the PLT. + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), + OpFlags); + } + + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector<SDValue, 8> Ops; + + if (!IsSibcall && isTailCall) { + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + InFlag = Chain.getValue(1); + } + + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (isTailCall) + Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // Add a register mask operand representing the call-preserved registers. 
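+ // (In a register mask operand a set bit means the corresponding physical
+ // register is preserved across the call; clear bits mark registers the
+ // callee may clobber. This avoids listing every preserved register as an
+ // implicit operand on the call instruction.)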
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (isTailCall) {
+ // We used to do:
+ //// If this is the first return lowered for this function, add the regs
+ //// to the liveout set for the function.
+ // This isn't right, although it's probably harmless on x86; liveouts
+ // should be computed from returns not tail calls. Consider a void
+ // function making a tail call to a function returning int.
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPush;
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ getTargetMachine().Options.GuaranteedTailCallOpt))
+ NumBytesForCalleeToPush = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ SR == StackStructReturn)
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+ NumBytesForCalleeToPush = 4;
+ else
+ NumBytesForCalleeToPush = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like stdcall, the callee cleans up the arguments, except that ECX is
+// reserved for storing the address of the tail-called function. Only 2
+// registers are free for argument passing (inreg). Tail call optimization
+// is performed provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On X86_64 architecture with GOT-style position independent code only local
+// (within module) calls are supported at the moment.
+// To keep the stack aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
+// for example)
+// If the callee of a tail call has more arguments than the caller, the caller
+// needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after the
+// original RETADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Round the stack size up so that, together
+/// with the pushed return address, it keeps the stack aligned, e.g. to
+/// 16n + 12 for a 16 byte alignment requirement with 4-byte stack slots.
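+/// For example, with a 16-byte stack alignment and 4-byte stack slots, a
+/// 16-byte argument area is rounded up to 28 (16*1 + 12), so that once the
+/// 4-byte return address is pushed the stack is 16-byte aligned again.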
+unsigned +X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG& DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const TargetMachine &TM = MF.getTarget(); + const TargetFrameLowering &TFI = *TM.getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + uint64_t AlignMask = StackAlignment - 1; + int64_t Offset = StackSize; + unsigned SlotSize = RegInfo->getSlotSize(); + if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { + // Number smaller than 12 so just add the difference. + Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); + } else { + // Mask out lower bits, add stackalignment once plus the 12 bytes. + Offset = ((~AlignMask) & Offset) + StackAlignment + + (StackAlignment-SlotSize); + } + return Offset; +} + +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const X86InstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + unsigned Opcode = Def->getOpcode(); + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && + Def->getOperand(1).isFI()) { + FI = Def->getOperand(1).getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { + FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); + FI = FINode->getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG &DAG) const { + if (!IsTailCallConvention(CalleeCC) && + CalleeCC != CallingConv::C) + return false; + + // If -tailcallopt is specified, make fastcc functions tail-callable. 
+ const MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = DAG.getMachineFunction().getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (IsTailCallConvention(CalleeCC) && CCMatch) + return true; + return false; + } + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // An stdcall caller is expected to clean up its arguments; the callee + // isn't going to do that. + if (!CCMatch && CallerCC == CallingConv::X86_StdCall) + return false; + + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (Subtarget->isTargetWin64()) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the call result is in ST0 / ST1, it needs to be popped off the x87 + // stack. Therefore, if it's not used by the call it is not safe to optimize + // this into a sibcall. + bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + } + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. 
+ if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (Subtarget->isTargetWin64()) { + CCInfo.AllocateStack(32, 8); + } + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) + return false; + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = + ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. These happen to be the same + // registers used to pass 'inreg' arguments so watch out for those. + if (!Subtarget->is64Bit() && + ((!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) || + getTargetMachine().getRelocationModel() == Reloc::PIC_)) { + unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = + (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (!VA.isRegLoc()) + continue; + unsigned Reg = VA.getLocReg(); + switch (Reg) { + default: break; + case X86::EAX: case X86::EDX: case X86::ECX: + if (++NumInRegs == MaxInRegs) + return false; + break; + } + } + } + } + + return true; +} + +FastISel * +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return X86::createFastISel(funcInfo, libInfo); +} + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +static bool MayFoldLoad(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); +} + +static bool MayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); +} + +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: + case X86ISD::PALIGNR: + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: + case X86ISD::VPERMI: + return true; + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + return DAG.getNode(Opc, dl, VT, V1); + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, unsigned TargetMask, + SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::VPERMILP: + case X86ISD::VPERMI: + return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, unsigned TargetMask, + SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PALIGNR: + case X86ISD::SHUFP: + case X86ISD::VPERM2X128: + return DAG.getNode(Opc, dl, VT, V1, V2, + DAG.getConstant(TargetMask, MVT::i8)); + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + return DAG.getNode(Opc, dl, VT, V1, V2); + } +} + +SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + int ReturnAddrIndex = FuncInfo->getRAIndex(); + + if (ReturnAddrIndex == 0) { + // Set up a frame object for the return address. 
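+ // The slot is SlotSize bytes at offset -SlotSize: the word pushed by the
+ // caller's CALL instruction (8 bytes on x86-64, 4 bytes on x86-32).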
+ unsigned SlotSize = RegInfo->getSlotSize();
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+ false);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement) {
+ // Offset should fit into a 32-bit immediate field.
+ if (!isInt<32>(Offset))
+ return false;
+
+ // If we don't have a symbolic displacement - we don't have any extra
+ // restrictions.
+ if (!hasSymbolicDisplacement)
+ return true;
+
+ // FIXME: Some tweaks might be needed for medium code model.
+ if (M != CodeModel::Small && M != CodeModel::Kernel)
+ return false;
+
+ // For the small code model we assume that the last object is within 16MB
+ // of the end of the 31-bit address range. We may also accept pretty large
+ // negative constants, knowing that all objects are in the positive half of
+ // the address space.
+ if (M == CodeModel::Small && Offset < 16*1024*1024)
+ return true;
+
+ // For the kernel code model we know that all objects reside in the negative
+ // half of the 32-bit address space. We do not accept negative offsets, since
+ // they may fall out of that range, but we may accept pretty large positive
+ // ones.
+ if (M == CodeModel::Kernel && Offset > 0)
+ return true;
+
+ return false;
+}
+
+/// isCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+ if (IsVarArg)
+ return false;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ return !is64Bit;
+ case CallingConv::X86_FastCall:
+ return !is64Bit;
+ case CallingConv::X86_ThisCall:
+ return !is64Bit;
+ case CallingConv::Fast:
+ return TailCallOpt;
+ case CallingConv::GHC:
+ return TailCallOpt;
+ case CallingConv::HiPE:
+ return TailCallOpt;
+ }
+}
+
+/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
+/// X86-specific condition code, returning the condition code and the LHS/RHS
+/// of the comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+ SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
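+ // e.g. setolt (load X), Y becomes setogt Y, (load X), so the load ends up
+ // on the RHS, where the compare instruction can fold it as a memory operand.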
+ if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+ !ISD::isNON_EXTLoad(RHS.getNode())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ case ISD::SETOEQ:
+ case ISD::SETUNE: return X86::COND_INVALID;
+ }
+}
+
+/// hasFPCMov - Return true if there is a floating point cmov for the specific
+/// X86 condition code. The current x86 isa includes the following FP cmov
+/// instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+ if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+ return true;
+ }
+ return false;
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls within
+/// the specified range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ return (Val < 0 || Val == CmpVal);
+}
+
+/// isSequentialOrUndefInRange - Return true if every element in Mask,
+/// beginning at position Pos and ending at Pos+Size, is undef or falls within
+/// the specified sequential range [Low, Low+Size).
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+ unsigned Pos, unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
+/// the second operand.
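+/// For example, on v4i32 the mask <2, 3, 0, 1> (immediate 0x4E) is a valid
+/// PSHUFD mask, while <0, 4, 1, 5> is not, since elements 4 and 5 refer to
+/// the second operand.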
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { + if (VT == MVT::v4f32 || VT == MVT::v4i32 ) + return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return (Mask[0] < 2 && Mask[1] < 2); + return false; +} + +/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PSHUFHW. +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { + if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) + return false; + + // Lower quadword copied in order or undef. + if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) + return false; + + // Upper quadword shuffled. + for (unsigned i = 4; i != 8; ++i) + if (!isUndefOrInRange(Mask[i], 4, 8)) + return false; + + if (VT == MVT::v16i16) { + // Lower quadword copied in order or undef. + if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) + return false; + + // Upper quadword shuffled. + for (unsigned i = 12; i != 16; ++i) + if (!isUndefOrInRange(Mask[i], 12, 16)) + return false; + } + + return true; +} + +/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PSHUFLW. +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { + if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) + return false; + + // Upper quadword copied in order. + if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) + return false; + + // Lower quadword shuffled. + for (unsigned i = 0; i != 4; ++i) + if (!isUndefOrInRange(Mask[i], 0, 4)) + return false; + + if (VT == MVT::v16i16) { + // Upper quadword copied in order. + if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) + return false; + + // Lower quadword shuffled. + for (unsigned i = 8; i != 12; ++i) + if (!isUndefOrInRange(Mask[i], 8, 12)) + return false; + } + + return true; +} + +/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PALIGNR. +static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || + (VT.is256BitVector() && !Subtarget->hasInt256())) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + // Do not handle 64-bit element shuffles with palignr. + if (NumLaneElts == 2) + return false; + + for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { + unsigned i; + for (i = 0; i != NumLaneElts; ++i) { + if (Mask[i+l] >= 0) + break; + } + + // Lane is all undef, go to next lane + if (i == NumLaneElts) + continue; + + int Start = Mask[i+l]; + + // Make sure its in this lane in one of the sources + if (!isUndefOrInRange(Start, l, l+NumLaneElts) && + !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) + return false; + + // If not lane 0, then we must match lane 0 + if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) + return false; + + // Correct second source to be contiguous with first source + if (Start >= (int)NumElts) + Start -= NumElts - NumLaneElts; + + // Make sure we're shifting in the right direction. + if (Start <= (int)(i+l)) + return false; + + Start -= i; + + // Check the rest of the elements to see if they are consecutive. 
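+ // e.g. for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> passes: Start is 1,
+ // every later element equals Start+i, and element 8 (the first element of
+ // the second source) continues the sequence, matching a PALIGNR of the
+ // concatenated sources by one element (2 bytes).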
+ for (++i; i != NumLaneElts; ++i) {
+ int Idx = Mask[i+l];
+
+ // Make sure it's in this lane
+ if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
+
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
+
+ if (Idx >= (int)NumElts)
+ Idx -= NumElts - NumLaneElts;
+
+ if (!isUndefOrEqual(Idx, Start+i))
+ return false;
+
+ }
+ }
+
+ return true;
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+ unsigned NumElems) {
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = Mask[i];
+ if (idx < 0)
+ continue;
+ else if (idx < (int)NumElems)
+ Mask[i] = idx + NumElems;
+ else
+ Mask[i] = idx - NumElems;
+ }
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
+/// SHUFPS and SHUFPD. If Commuted is true, it checks for the sources in the
+/// reverse order of what x86 shuffles want.
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
+ bool Commuted = false) {
+ if (!HasFp256 && VT.is256BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElems = NumElems/NumLanes;
+
+ if (NumLaneElems != 2 && NumLaneElems != 4)
+ return false;
+
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ // VSHUFPDY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
+ //
+ unsigned HalfLaneElems = NumLaneElems/2;
+ for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Idx = Mask[i+l];
+ unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+ if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+ return false;
+ // For VSHUFPSY, the mask of the second half must be the same as the
+ // first but with the appropriate offsets. This works in the same way as
+ // VPERMILPS works with masks.
+ if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ continue;
+ if (!isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 4)
+ return false;
+
+ // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
+ return isUndefOrEqual(Mask[0], 6) &&
+ isUndefOrEqual(Mask[1], 7) &&
+ isUndefOrEqual(Mask[2], 2) &&
+ isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e.
vector_shuffle v, undef, +/// <2, 3, 2, 3> +static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { + if (!VT.is128BitVector()) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + + if (NumElems != 4) + return false; + + return isUndefOrEqual(Mask[0], 2) && + isUndefOrEqual(Mask[1], 3) && + isUndefOrEqual(Mask[2], 2) && + isUndefOrEqual(Mask[3], 3); +} + +/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. +static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { + if (!VT.is128BitVector()) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + + if (NumElems != 2 && NumElems != 4) + return false; + + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i + NumElems)) + return false; + + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) + return false; + + return true; +} + +/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVLHPS. +static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { + if (!VT.is128BitVector()) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + + if (NumElems != 2 && NumElems != 4) + return false; + + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) + return false; + + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask[i + e], i + NumElems)) + return false; + + return true; +} + +// +// Some special combinations that can be optimized. +// +static +SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG) { + MVT VT = SVOp->getValueType(0).getSimpleVT(); + DebugLoc dl = SVOp->getDebugLoc(); + + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return SDValue(); + + ArrayRef<int> Mask = SVOp->getMask(); + + // These are the special masks that may be optimized. + static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; + static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; + bool MatchEvenMask = true; + bool MatchOddMask = true; + for (int i=0; i<8; ++i) { + if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) + MatchEvenMask = false; + if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) + MatchOddMask = false; + } + + if (!MatchEvenMask && !MatchOddMask) + return SDValue(); + + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); + + SDValue Op0 = SVOp->getOperand(0); + SDValue Op1 = SVOp->getOperand(1); + + if (MatchEvenMask) { + // Shift the second operand right to 32 bits. + static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; + Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); + } else { + // Shift the first operand left to 32 bits. + static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; + Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); + } + static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); +} + +/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to UNPCKL. 
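+/// e.g. for v4i32 the canonical UNPCKL mask is <0, 4, 1, 5>: the low halves
+/// of the two sources are interleaved into <V1[0], V2[0], V1[1], V2[1]>.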
+static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, + bool HasInt256, bool V2IsSplat = false) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) + return false; + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) + return false; + } + } + } + + return true; +} + +/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to UNPCKH. +static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, + bool HasInt256, bool V2IsSplat = false) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) + return false; + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j+NumElts)) + return false; + } + } + } + return true; +} + +/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form +/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, +/// <0, 0, 1, 1> +static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { + unsigned NumElts = VT.getVectorNumElements(); + bool Is256BitVec = VT.is256BitVector(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (Is256BitVec && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) + return false; + + // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern + // FIXME: Need a better way to get rid of this, there's no latency difference + // between UNPCKLPD and MOVDDUP, the later should always be checked first and + // the former later. We should also remove the "_undef" special mask. + if (NumElts == 4 && Is256BitVec) + return false; + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. 
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
+ i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
+ if (VT.getVectorElementType().getSizeInBits() < 32)
+ return false;
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (!isUndefOrEqual(Mask[0], NumElts))
+ return false;
+
+ for (unsigned i = 1; i != NumElts; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ return true;
+}
+
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle below:
+/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from
+/// the second half of V2.
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+ if (!HasFp256 || !VT.is256BitVector())
+ return false;
+
+ // The shuffle result is divided into half A and half B. In total the two
+ // sources have 4 halves, namely: C, D, E, F. The final values of A and
+ // B must come from C, D, E or F.
+ unsigned HalfSize = VT.getVectorNumElements()/2;
+ bool MatchA = false, MatchB = false;
+
+ // Check if A comes from one of C, D, E, F.
+ for (unsigned Half = 0; Half != 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ // Check if B comes from one of C, D, E, F.
+ for (unsigned Half = 0; Half != 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
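+/// For the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> above, the first half
+/// selects half 1 (the upper half of V1) and the second half selects half 3
+/// (the upper half of V2), giving the immediate 1 | (3 << 4) = 0x31.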
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+
+  unsigned FstHalf = 0, SndHalf = 0;
+  for (unsigned i = 0; i < HalfSize; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      FstHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      SndHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+
+  return (FstHalf | (SndHalf << 4));
+}
+
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching differs depending on whether the underlying
+/// type is 32- or 64-bit. For VPERMILPS the high half of the mask should point
+/// to the same elements as the low half, but in the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
+static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  // Only match 256-bit with 32/64-bit types
+  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
+    return false;
+
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned LaneSize = NumElts/NumLanes;
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (NumElts != 8 || l == 0)
+        continue;
+      // VPERMILPS handling
+      if (Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
+/// what X86 MOVSS wants: the lowest element must be the lowest element of
+/// vector 2 and the other elements must come from vector 1 in order.
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+                               bool V2IsSplat = false, bool V2IsUndef = false) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumOps = VT.getVectorNumElements();
+  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+    return false;
+
+  if (!isUndefOrEqual(Mask[0], 0))
+    return false;
+
+  for (unsigned i = 1; i != NumOps; ++i)
+    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+      return false;
+
+  return true;
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+                           const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8))
+    return false;
+
+  // "i+1" is the value the indexed mask element must have
+  for (unsigned i = 0; i != NumElems; i += 2)
+    if (!isUndefOrEqual(Mask[i], i+1) ||
+        !isUndefOrEqual(Mask[i+1], i+1))
+      return false;
+
+  return true;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> +static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + + if ((VT.is128BitVector() && NumElems != 4) || + (VT.is256BitVector() && NumElems != 8)) + return false; + + // "i" is the value the indexed mask element must have + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i) || + !isUndefOrEqual(Mask[i+1], i)) + return false; + + return true; +} + +/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 256-bit +/// version of MOVDDUP. +static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { + if (!HasFp256 || !VT.is256BitVector()) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts != 4) + return false; + + for (unsigned i = 0; i != NumElts/2; ++i) + if (!isUndefOrEqual(Mask[i], 0)) + return false; + for (unsigned i = NumElts/2; i != NumElts; ++i) + if (!isUndefOrEqual(Mask[i], NumElts/2)) + return false; + return true; +} + +/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 128-bit +/// version of MOVDDUP. +static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { + if (!VT.is128BitVector()) + return false; + + unsigned e = VT.getVectorNumElements() / 2; + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) + return false; + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[e+i], i)) + return false; + return true; +} + +/// isVEXTRACTF128Index - Return true if the specified +/// EXTRACT_SUBVECTOR operand specifies a vector extract that is +/// suitable for input to VEXTRACTF128. +bool X86::isVEXTRACTF128Index(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) + return false; + + // The index should be aligned on a 128-bit boundary. + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + MVT VT = N->getValueType(0).getSimpleVT(); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); + bool Result = (Index * ElSize) % 128 == 0; + + return Result; +} + +/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR +/// operand specifies a subvector insert that is suitable for input to +/// VINSERTF128. +bool X86::isVINSERTF128Index(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) + return false; + + // The index should be aligned on a 128-bit boundary. + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + MVT VT = N->getValueType(0).getSimpleVT(); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); + bool Result = (Index * ElSize) % 128 == 0; + + return Result; +} + +/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. +/// Handles 128-bit and 256-bit. +static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { + MVT VT = N->getValueType(0).getSimpleVT(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for PSHUF/SHUFP"); + + // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate + // independently on 128-bit lanes. 
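+  // e.g. for v4i32 the mask <3, 1, 0, 2> is encoded two bits per element,
+  // low element first, giving imm = 0b10000111 (0x87).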
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
+         "Only supports 2 or 4 elements per lane");
+
+  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+  unsigned Mask = 0;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt < 0) continue;
+    Elt &= NumLaneElts - 1;
+    unsigned ShAmt = (i << Shift) % 8;
+    Mask |= Elt << ShAmt;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
+static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned Mask = 0;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 elements per lane, but we only care about the last 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i+4);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits.
+      Mask |= Elt << (i * 2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
+static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFLW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned Mask = 0;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 elements per lane, but we only care about the first 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits
+      Mask |= Elt << (i * 2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  int Val = 0;
+  unsigned i;
+  for (i = 0; i != NumElts; ++i) {
+    Val = SVOp->getMaskElt(i);
+    if (Val >= 0)
+      break;
+  }
+  if (Val >= (int)NumElts)
+    Val -= NumElts - NumLaneElts;
+
+  assert(Val - i > 0 && "PALIGNR imm should be positive");
+  return (Val - i) * EltSize;
+}
+
+/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// instructions.
+unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+/// getInsertVINSERTF128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// instructions.
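+/// For example, inserting a v4f32 subvector at element index 4 of a v8f32
+/// (4 elements per 128-bit chunk) yields the immediate 1, selecting the
+/// upper half of the destination.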
+unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    llvm_unreachable("Illegal insert subvector for VINSERTF128");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VecVT = N->getValueType(0).getSimpleVT();
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
+/// Handles 256-bit.
+static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is256BitVector() && NumElts == 4) &&
+         "Unsupported vector type for VPERMQ/VPERMPD");
+
+  unsigned Mask = 0;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt < 0)
+      continue;
+    Mask |= Elt << (i*2);
+  }
+
+  return Mask;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
+    return CN->isNullValue();
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
+    return CFP->getValueAPF().isPosZero();
+  return false;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+                                    SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> MaskVec;
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int Idx = SVOp->getMaskElt(i);
+    if (Idx >= 0) {
+      if (Idx < (int)NumElems)
+        Idx += NumElems;
+      else
+        Idx -= NumElems;
+    }
+    MaskVec.push_back(Idx);
+  }
+  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+                              SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from the upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+  if (VT.getVectorNumElements() != 4)
+    return false;
+  for (unsigned i = 0, e = 2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i+2))
+      return false;
+  for (unsigned i = 2; i != 4; ++i)
+    if (!isUndefOrEqual(Mask[i], i+4))
+      return false;
+  return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+  N = N->getOperand(0).getNode();
+  if (!ISD::isNON_EXTLoad(N))
+    return false;
+  if (LD)
+    *LD = cast<LoadSDNode>(N);
+  return true;
+}
+
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  // Check for any non-constant elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    switch (N->getOperand(i).getNode()->getOpcode()) {
+    case ISD::UNDEF:
+    case ISD::ConstantFP:
+    case ISD::Constant:
+      break;
+    default:
+      return false;
+    }
+
+  // Vectors of all-zeros and all-ones are materialized with special
+  // instructions rather than being loaded.
+  return !ISD::isBuildVectorAllZeros(N) &&
+         !ISD::isBuildVectorAllOnes(N);
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from the lower half
+/// of V1 (and in order), and the upper half elements should come from the
+/// upper half of V2 (and in order). And since V1 will become the source of
+/// the MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+                               ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+    return false;
+  // If V2 is a vector load, don't do this transformation. We will try to use
+  // a load-folding shufps op instead.
+  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i+NumElems))
+      return false;
+  return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  SDValue SplatValue = N->getOperand(0);
+  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i) != SplatValue)
+      return false;
+  return true;
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int Idx = N->getMaskElt(i);
+    if (Idx >= (int)NumElems) {
+      unsigned Opc = V2.getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
+        return false;
+    } else if (Idx >= 0) {
+      unsigned Opc = V1.getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !X86::isZeroNode(V1.getOperand(Idx)))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build SSE zero vectors as <4 x i32> bitcasted
+  // to their dest type. This ensures they get CSE'd.
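+  // e.g. a v2f64 zero is emitted as (v2f64 (bitcast (v4i32 <0, 0, 0, 0>)))
+  // on SSE2 targets.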
+  SDValue Vec;
+  if (VT.is128BitVector()) {  // SSE
+    if (Subtarget->hasSSE2()) {  // SSE2
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    } else { // SSE1
+      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+    }
+  } else if (VT.is256BitVector()) { // AVX
+    if (Subtarget->hasInt256()) { // AVX2
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
+                        array_lengthof(Ops));
+    } else {
+      // 256-bit logic and arithmetic instructions in AVX are all
+      // floating-point, no support for integer ops. Emit fp zeroed vectors.
+      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
+                        array_lengthof(Ops));
+    }
+  } else
+    llvm_unreachable("Unexpected vector type");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
+                             DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec;
+  if (VT.is256BitVector()) {
+    if (HasInt256) { // AVX2
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
+                        array_lengthof(Ops));
+    } else { // AVX
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+    }
+  } else if (VT.is128BitVector()) {
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  } else
+    llvm_unreachable("Unexpected vector type");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+/// that point to V2 point to its first element.
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    if (Mask[i] > (int)NumElems) {
+      Mask[i] = NumElems;
+    }
+  }
+}
+
+/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                       SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  Mask.push_back(NumElems);
+  for (unsigned i = 1; i != NumElems; ++i)
+    Mask.push_back(i);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+    Mask.push_back(i);
+    Mask.push_back(i + NumElems);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
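+/// e.g. for v4f32 the resulting mask is <2, 6, 3, 7>, interleaving the high
+/// halves of V1 and V2.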
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
+    Mask.push_back(i + Half);
+    Mask.push_back(i + NumElems + Half);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target supported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+  EVT VT = V.getValueType();
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = V.getDebugLoc();
+
+  while (NumElems > 4) {
+    if (EltNo < NumElems/2) {
+      V = getUnpackl(DAG, dl, VT, V, V);
+    } else {
+      V = getUnpackh(DAG, dl, VT, V, V);
+      EltNo -= NumElems/2;
+    }
+    NumElems >>= 1;
+  }
+  return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+  EVT VT = V.getValueType();
+  DebugLoc dl = V.getDebugLoc();
+
+  if (VT.is128BitVector()) {
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+                             &SplatMask[0]);
+  } else if (VT.is256BitVector()) {
+    // To use VPERMILPS to splat scalars, the second half of indices must
+    // refer to the higher part, which is a duplication of the lower one,
+    // because VPERMILPS can only handle in-lane permutations.
+    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+                             &SplatMask[0]);
+  } else
+    llvm_unreachable("Vector size not supported");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+  EVT SrcVT = SV->getValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  DebugLoc dl = SV->getDebugLoc();
+
+  int EltNo = SV->getSplatIndex();
+  int NumElems = SrcVT.getVectorNumElements();
+  bool Is256BitVec = SrcVT.is256BitVector();
+
+  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
+         "Unknown how to promote splat for type");
+
+  // Extract the 128-bit part containing the splat element and update
+  // the splat element index when it refers to the higher register.
+  if (Is256BitVec) {
+    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+    if (EltNo >= NumElems/2)
+      EltNo -= NumElems/2;
+  }
+
+  // i16 and i8 vector types can't be used directly by a generic shuffle
+  // instruction because the target has no such instruction. Generate shuffles
+  // which repeat i16 and i8 several times until they fit in i32, and then can
+  // be manipulated by target supported shuffles.
+  EVT EltVT = SrcVT.getVectorElementType();
+  if (EltVT == MVT::i8 || EltVT == MVT::i16)
+    V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+  // Recreate the 256-bit vector and place the same 128-bit vector
+  // into the low and high part. This is necessary because we want
+  // to use VPERM* to shuffle the vectors.
+  if (Is256BitVec) {
+    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
+  }
+
+  return getLegalSplat(DAG, V1, EltNo);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector and a zero or undef vector. This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+                                           bool IsZero,
+                                           const X86Subtarget *Subtarget,
+                                           SelectionDAG &DAG) {
+  EVT VT = V2.getValueType();
+  SDValue V1 = IsZero
+    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 16> MaskVec;
+  for (unsigned i = 0; i != NumElems; ++i)
+    // If this is the insertion idx, put the low elt of V2 here.
+    MaskVec.push_back(i == Idx ? NumElems : i);
+  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if the node only uses one source.
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
+                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue ImmN;
+
+  IsUnary = false;
+  switch(N->getOpcode()) {
+  case X86ISD::SHUFP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::UNPCKH:
+    DecodeUNPCKHMask(VT, Mask);
+    break;
+  case X86ISD::UNPCKL:
+    DecodeUNPCKLMask(VT, Mask);
+    break;
+  case X86ISD::MOVHLPS:
+    DecodeMOVHLPSMask(NumElems, Mask);
+    break;
+  case X86ISD::MOVLHPS:
+    DecodeMOVLHPSMask(NumElems, Mask);
+    break;
+  case X86ISD::PALIGNR:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::PSHUFD:
+  case X86ISD::VPERMILP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFHW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFLW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VPERMI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD: {
+    // The index 0 always comes from the first element of the second source,
+    // this is why MOVSS and MOVSD are used in the first place. The other
+    // elements come from the corresponding positions of the first source
+    // vector.
+    Mask.push_back(NumElems);
+    for (unsigned i = 1; i != NumElems; ++i) {
+      Mask.push_back(i);
+    }
+    break;
+  }
+  case X86ISD::VPERM2X128:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    if (Mask.empty()) return false;
+    break;
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+    // Not yet implemented
+    return false;
+  default: llvm_unreachable("unknown target shuffle node");
+  }
+
+  return true;
+}
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+                                   unsigned Depth) {
+  if (Depth == 6)
+    return SDValue();             // Limit search depth.
+
+  SDValue V = SDValue(N, 0);
+  EVT VT = V.getValueType();
+  unsigned Opcode = V.getOpcode();
+
+  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+    int Elt = SV->getMaskElt(Index);
+
+    if (Elt < 0)
+      return DAG.getUNDEF(VT.getVectorElementType());
+
+    unsigned NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+                                         : SV->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+  }
+
+  // Recurse into target specific vector shuffles to find scalars.
+  if (isTargetShuffle(Opcode)) {
+    MVT ShufVT = V.getValueType().getSimpleVT();
+    unsigned NumElems = ShufVT.getVectorNumElements();
+    SmallVector<int, 16> ShuffleMask;
+    bool IsUnary;
+
+    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+      return SDValue();
+
+    int Elt = ShuffleMask[Index];
+    if (Elt < 0)
+      return DAG.getUNDEF(ShufVT.getVectorElementType());
+
+    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+                                         : N->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
+                               Depth+1);
+  }
+
+  // Actual nodes that may contain scalar elements
+  if (Opcode == ISD::BITCAST) {
+    V = V.getOperand(0);
+    EVT SrcVT = V.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+
+    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+      return SDValue();
+  }
+
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Index == 0) ? V.getOperand(0)
+                        : DAG.getUNDEF(VT.getVectorElementType());
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Index);
+
+  return SDValue();
+}
+
+/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
+/// vector shuffle operation which resolve to zero. The search can start in two
+/// different directions, from left or right.
+static
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
+                                  bool ZerosFromLeft, SelectionDAG &DAG) {
+  unsigned i;
+  for (i = 0; i != NumElems; ++i) {
+    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
+    if (!(Elt.getNode() &&
+         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+      break;
+  }
+
+  return i;
+}
+
+/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
+/// starting from its index OpIdx. OpNum is set to the source vector operand.
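+/// e.g. for a v4i32 shuffle, the mask <4, 5, 6, 7> over the range [0, 4) with
+/// OpIdx 0 comes consecutively from V2, so OpNum is set to 1.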
+static
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+                              unsigned NumElems, unsigned &OpNum) {
+  bool SeenV1 = false;
+  bool SeenV2 = false;
+
+  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
+    int Idx = SVOp->getMaskElt(i);
+    // Ignore undef indices
+    if (Idx < 0)
+      continue;
+
+    if (Idx < (int)NumElems)
+      SeenV1 = true;
+    else
+      SeenV2 = true;
+
+    // Only accept consecutive elements from the same vector
+    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
+      return false;
+  }
+
+  OpNum = SeenV1 ? 0 : 1;
+  return true;
+}
+
+/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
+/// logical right shift of a vector.
+static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+              false /* check zeros from right */, DAG);
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+  //
+  //               V1 = {X, A, B, C}     0
+  //                         \  \  \    /
+  //   vector_shuffle V1, V2 <1, 2, 3, X>
+  //
+  if (!isShuffleMaskConsecutive(SVOp,
+            0,                   // Mask Start Index
+            NumElems-NumZeros,   // Mask End Index (exclusive)
+            NumZeros,            // Where to start looking in the src vector
+            NumElems,            // Number of elements in vector
+            OpSrc))              // Which source operand ?
+    return false;
+
+  isLeft = false;
+  ShAmt = NumZeros;
+  ShVal = SVOp->getOperand(OpSrc);
+  return true;
+}
+
+/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+              true /* check zeros from left */, DAG);
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+  //
+  //                           0    { A, B, X, X } = V2
+  //                          / \    /  /
+  //   vector_shuffle V1, V2 <X, X, 4, 5>
+  //
+  if (!isShuffleMaskConsecutive(SVOp,
+            NumZeros,     // Mask Start Index
+            NumElems,     // Mask End Index (exclusive)
+            0,            // Where to start looking in the src vector
+            NumElems,     // Number of elements in vector
+            OpSrc))       // Which source operand ?
+    return false;
+
+  isLeft = true;
+  ShAmt = NumZeros;
+  ShVal = SVOp->getOperand(OpSrc);
+  return true;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  // Although the logic below supports any bitwidth size, there are no
+  // shift instructions which handle more than 128-bit vectors.
+  if (!SVOp->getValueType(0).is128BitVector())
+    return false;
+
+  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
+      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
+    return true;
+
+  return false;
+}
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
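+/// Adjacent byte pairs are zero-extended to i16; the odd-indexed byte is
+/// shifted left by 8 and OR'd with the even one, and the combined i16 is
+/// inserted into a v8i16 which is finally bitcast back to v16i8.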
+/// +static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget* Subtarget, + const TargetLowering &TLI) { + if (NumNonZero > 8) + return SDValue(); + + DebugLoc dl = Op.getDebugLoc(); + SDValue V(0, 0); + bool First = true; + for (unsigned i = 0; i < 16; ++i) { + bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; + if (ThisIsNonZero && First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + + if ((i & 1) != 0) { + SDValue ThisElt(0, 0), LastElt(0, 0); + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + if (LastIsNonZero) { + LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::i16, Op.getOperand(i-1)); + } + if (ThisIsNonZero) { + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, + ThisElt, DAG.getConstant(8, MVT::i8)); + if (LastIsNonZero) + ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); + } else + ThisElt = LastElt; + + if (ThisElt.getNode()) + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i/2)); + } + } + + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); +} + +/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. +/// +static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget* Subtarget, + const TargetLowering &TLI) { + if (NumNonZero > 4) + return SDValue(); + + DebugLoc dl = Op.getDebugLoc(); + SDValue V(0, 0); + bool First = true; + for (unsigned i = 0; i < 8; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v8i16, V, Op.getOperand(i), + DAG.getIntPtrConstant(i)); + } + } + + return V; +} + +/// getVShift - Return a vector logical shift node. +/// +static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, + unsigned NumBits, SelectionDAG &DAG, + const TargetLowering &TLI, DebugLoc dl) { + assert(VT.is128BitVector() && "Unknown type for VShift"); + EVT ShVT = MVT::v2i64; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; + SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opc, dl, ShVT, SrcOp, + DAG.getConstant(NumBits, + TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); +} + +SDValue +X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, + SelectionDAG &DAG) const { + + // Check if the scalar load can be widened into a vector load. And if + // the address is "base + cst" see if the cst can be "absorbed" into + // the shuffle mask. 
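+  // e.g. for a v4i32 result, a 4-byte load from (base + 4) with a 16-byte
+  // aligned base can be widened to a v4i32 load from base with the splat
+  // mask <1, 1, 1, 1>.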
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    // FIXME: 256-bit vector instructions don't require a strict alignment,
+    // improve this code to support it better.
+    unsigned RequiredAlign = VT.getSizeInBits()/8;
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16 or 32.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute
+        // the exact stack offset and reference FI + adjust offset instead,
+        // if someone *really* cares about this; that's the way to implement
+        // it.
+        return SDValue();
+      } else {
+        MFI->setObjectAlignment(FI, RequiredAlign);
+      }
+    }
+
+    // (Offset % RequiredAlign) must be a multiple of 4. The address is then
+    // Ptr + (Offset & ~(RequiredAlign-1)).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % RequiredAlign) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~(RequiredAlign-1);
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    unsigned NumElems = VT.getVectorNumElements();
+
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+                             LD->getPointerInfo().getWithOffset(StartOffset),
+                             false, false, false, 0);
+
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumElems; ++i)
+      Mask.push_back(EltNo);
+
+    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+  }
+
+  return SDValue();
+}
+
+/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
+/// vector of type 'VT', see if the elements can be replaced by a single large
+/// load which has the same value as a build_vector whose operands are 'elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
+///
+/// FIXME: we'd also like to handle the case where the last elements are zero
+/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
+/// There's even a handy isZeroNode for that purpose.
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+                                        DebugLoc &DL, SelectionDAG &DAG) {
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = Elts.size();
+
+  LoadSDNode *LDBase = NULL;
+  unsigned LastLoadedElt = -1U;
+
+  // For each element in the initializer, see if we've found a load or an
+  // undef. If we don't find an initial load element, or later load elements
+  // are non-consecutive, bail out.
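+  // e.g. <load a, load a+4, load a+8, load a+12> becomes a single v4i32 load
+  // from a, and <load a, load a+4, undef, undef> becomes a vzext_load of the
+  // low i64.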
+ for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast<LoadSDNode>(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast<LoadSDNode>(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. + if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), 0); + return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), LDBase->getAlignment()); + } + if (NumElems == 4 && LastLoadedElt == 1 && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, + array_lengthof(Ops), MVT::i64, + LDBase->getPointerInfo(), + LDBase->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); + } + return SDValue(); +} + +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, +/// or SDValue() otherwise. +SDValue +X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->hasFp256()) + return SDValue(); + + MVT VT = Op.getValueType().getSimpleVT(); + DebugLoc dl = Op.getDebugLoc(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for broadcast."); + + SDValue Ld; + bool ConstSplatVal; + + switch (Op.getOpcode()) { + default: + // Unknown pattern found. + return SDValue(); + + case ISD::BUILD_VECTOR: { + // The BUILD_VECTOR node must be a splat. + if (!isSplatVector(Op.getNode())) + return SDValue(); + + Ld = Op.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // The suspected load node has several users. 
Make sure that all
+    // of its users are from the BUILD_VECTOR node.
+    // Constants may have multiple users.
+    if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+      return SDValue();
+    break;
+  }
+
+  case ISD::VECTOR_SHUFFLE: {
+    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+    // Shuffles must have a splat mask where the first element is
+    // broadcasted.
+    if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+      return SDValue();
+
+    SDValue Sc = Op.getOperand(0);
+    if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+        Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+      if (!Subtarget->hasInt256())
+        return SDValue();
+
+      // Use the register form of the broadcast instruction available on AVX2.
+      if (VT.is256BitVector())
+        Sc = Extract128BitVector(Sc, 0, DAG, dl);
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+    }
+
+    Ld = Sc.getOperand(0);
+    ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                     Ld.getOpcode() == ISD::ConstantFP);
+
+    // The scalar_to_vector node and the suspected
+    // load node must have exactly one user.
+    // Constants may have multiple users.
+    if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+      return SDValue();
+    break;
+  }
+  }
+
+  bool Is256 = VT.is256BitVector();
+
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector. On Sandybridge it is still better to load a constant
+  // vector from the constant pool and not to broadcast it from a scalar.
+  if (ConstSplatVal && Subtarget->hasInt256()) {
+    EVT CVT = Ld.getValueType();
+    assert(!CVT.isVector() && "Must not broadcast a vector type");
+    unsigned ScalarSize = CVT.getSizeInBits();
+
+    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+      const Constant *C = 0;
+      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+        C = CI->getConstantIntValue();
+      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+        C = CF->getConstantFPValue();
+
+      assert(C && "Invalid constant type");
+
+      SDValue CP = DAG.getConstantPool(C, getPointerTy());
+      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, Alignment);
+
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+    }
+  }
+
+  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+
+  // Handle AVX2 in-register broadcasts.
+  if (!IsLoad && Subtarget->hasInt256() &&
+      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The scalar source must be a normal load.
+  if (!IsLoad)
+    return SDValue();
+
+  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The integer check is needed for the 64-bit into 128-bit case, so it
+  // doesn't match f64, since there is no vbroadcastsd xmm.
+  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+  }
+
+  // Unsupported broadcast.
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // Skip if insert_vec_elt is not supported.
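+  // e.g. a v4i32 build_vector whose operands are three extract_vector_elts
+  // from one source plus one other scalar becomes a shuffle followed by a
+  // single insert_vector_elt.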
+  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  DebugLoc DL = Op.getDebugLoc();
+  unsigned NumElems = Op.getNumOperands();
+
+  SDValue VecIn1;
+  SDValue VecIn2;
+  SmallVector<unsigned, 4> InsertIndices;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Opc = Op.getOperand(i).getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than one element needs inserting.
+      if (InsertIndices.size() > 1)
+        return SDValue();
+
+      InsertIndices.push_back(i);
+      continue;
+    }
+
+    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
+    // Quit if extracted from a vector of a different type.
+    if (ExtractedFromVec.getValueType() != VT)
+      return SDValue();
+
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+
+    if (VecIn1.getNode() == 0)
+      VecIn1 = ExtractedFromVec;
+    else if (VecIn1 != ExtractedFromVec) {
+      if (VecIn2.getNode() == 0)
+        VecIn2 = ExtractedFromVec;
+      else if (VecIn2 != ExtractedFromVec)
+        // Quit if more than two vectors need shuffling.
+        return SDValue();
+    }
+
+    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+
+    if (ExtractedFromVec == VecIn1)
+      Mask[i] = Idx;
+    else if (ExtractedFromVec == VecIn2)
+      Mask[i] = Idx + NumElems;
+  }
+
+  if (VecIn1.getNode() == 0)
+    return SDValue();
+
+  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+    unsigned Idx = InsertIndices[i];
+    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+                     DAG.getIntPtrConstant(Idx));
+  }
+
+  return NV;
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT ExtVT = VT.getVectorElementType();
+  unsigned NumElems = Op.getNumOperands();
+
+  // Vectors containing all zeros can be matched by pxor and xorps later
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    if (VT == MVT::v4i32 || VT == MVT::v8i32)
+      return Op;
+
+    return getZeroVector(VT, Subtarget, DAG, dl);
+  }
+
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+  // vpcmpeqd on 256-bit vectors.
+  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+      return Op;
+
+    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+  }
+
+  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+  if (Broadcast.getNode())
+    return Broadcast;
+
+  unsigned EVTBits = ExtVT.getSizeInBits();
+
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  bool IsAllConstants = true;
+  SmallSet<SDValue, 8> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    Values.insert(Elt);
+    if (Elt.getOpcode() != ISD::Constant &&
+        Elt.getOpcode() != ISD::ConstantFP)
+      IsAllConstants = false;
+    if (X86::isZeroNode(Elt))
+      NumZero++;
+    else {
+      NonZeros |= (1 << i);
+      NumNonZero++;
+    }
+  }
+
+  // All undef vector. Return an UNDEF.
All zero vectors were handled above. + if (NumNonZero == 0) + return DAG.getUNDEF(VT); + + // Special case for single non-zero, non-undef, element. + if (NumNonZero == 1) { + unsigned Idx = CountTrailingZeros_32(NonZeros); + SDValue Item = Op.getOperand(Idx); + + // If this is an insertion of an i64 value on x86-32, and if the top bits of + // the value are obviously zero, truncate the value to i32 and do the + // insertion that way. Only do this if the value is non-constant or if the + // value is a constant being inserted into element 0. It is cheaper to do + // a constant pool load than it is to do a movd + shuffle. + if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && + (!IsAllConstants || Idx == 0)) { + if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + // Handle SSE only. + assert(VT == MVT::v2i64 && "Expected an SSE value type!"); + EVT VecVT = MVT::v4i32; + unsigned VecElts = 4; + + // Truncate the value (which may itself be a constant) to i32, and + // convert it to a vector with movd (S2V+shuffle to zero extend). + Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + + // Now we have our 32-bit value zero extended in the low element of + // a vector. If Idx != 0, swizzle it into place. + if (Idx != 0) { + SmallVector<int, 4> Mask; + Mask.push_back(Idx); + for (unsigned i = 1; i != VecElts; ++i) + Mask.push_back(i); + Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), + &Mask[0]); + } + return DAG.getNode(ISD::BITCAST, dl, VT, Item); + } + } + + // If we have a constant or non-constant insertion into the low element of + // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into + // the rest of the elements. This will be matched as movd/movq/movss/movsd + // depending on what the source datatype is. + if (Idx == 0) { + if (NumZero == 0) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.is256BitVector()) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0)); + } + assert(VT.is128BitVector() && "Expected an SSE value type!"); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + if (VT.is256BitVector()) { + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); + } else { + assert(VT.is128BitVector() && "Expected an SSE value type!"); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + return DAG.getNode(ISD::BITCAST, dl, VT, Item); + } + } + + // Is it a vector logical left shift? + if (NumElems == 2 && Idx == 1 && + X86::isZeroNode(Op.getOperand(0)) && + !X86::isZeroNode(Op.getOperand(1))) { + unsigned NumBits = VT.getSizeInBits(); + return getVShift(true, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + VT, Op.getOperand(1)), + NumBits/2, DAG, *this, dl); + } + + if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
+      return SDValue();
+
+    // Otherwise, if this is a vector with i32 or f32 elements, and the element
+    // is a non-constant being inserted into an element other than the low one,
+    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+    // movd/movss) to move this into the low element, then shuffle it into
+    // place.
+    if (EVTBits == 32) {
+      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      // Turn it into a shuffle of zero and zero-extended scalar to vector.
+      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
+      SmallVector<int, 8> MaskVec;
+      for (unsigned i = 0; i != NumElems; ++i)
+        MaskVec.push_back(i == Idx ? 0 : 1);
+      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+    }
+  }
+
+  // Splat is obviously ok. Let legalizer expand it to a shuffle.
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
+    return SDValue();
+  }
+
+  // A vector full of immediates; various special cases are already
+  // handled, so this is best done with a single constant-pool load.
+  if (IsAllConstants)
+    return SDValue();
+
+  // For AVX-length vectors, build the individual 128-bit pieces and use
+  // shuffles to put them in place.
+  if (VT.is256BitVector()) {
+    SmallVector<SDValue, 32> V;
+    for (unsigned i = 0; i != NumElems; ++i)
+      V.push_back(Op.getOperand(i));
+
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+    // Build both the lower and upper subvector.
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
+                                NumElems/2);
+
+    // Recreate the wider vector with the lower and upper part.
+    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+  }
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+                               Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+    }
+    return SDValue();
+  }
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16) {
+    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                      Subtarget, *this);
+    if (V.getNode()) return V;
+  }
+
+  if (EVTBits == 16 && NumElems == 8) {
+    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                      Subtarget, *this);
+    if (V.getNode()) return V;
+  }
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  SmallVector<SDValue, 8> V(NumElems);
+  if (NumElems == 4 && NumZero > 0) {
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1 << i));
+      if (isZero)
+        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          V[i] = V[i*2];  // Must be a zero vector.
+ break; + case 1: + V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); + break; + case 2: + V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + case 3: + V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + } + } + + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; + return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); + } + + if (Values.size() > 1 && VT.is128BitVector()) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // Check for a build vector from mostly shuffle plus few inserting. + SDValue Sh = buildFromShuffleMostly(Op, DAG); + if (Sh.getNode()) + return Sh; + + // For SSE 4.1, use insertps to put the high elements into the low element. + if (getSubtarget()->hasSSE41()) { + SDValue Result; + if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); + + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, + Op.getOperand(i), DAG.getIntPtrConstant(i)); + } + return Result; + } + + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + for (unsigned i = 0; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() != ISD::UNDEF) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + V[i] = DAG.getUNDEF(VT); + } + + // Next, we iteratively mix elements, e.g. for v4f32: + // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> + // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> + // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> + unsigned EltStride = NumElems >> 1; + while (EltStride != 0) { + for (unsigned i = 0; i < EltStride; ++i) { + // If V[i+EltStride] is undef and this is the first round of mixing, + // then it is safe to just drop this shuffle: V[i] is already in the + // right place, the one element (since it's the first round) being + // inserted as undef can be dropped. This isn't safe for successive + // rounds because they will permute elements within both vectors. + if (V[i+EltStride].getOpcode() == ISD::UNDEF && + EltStride == NumElems/2) + continue; + + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + } + EltStride >>= 1; + } + return V[0]; + } + return SDValue(); +} + +// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// to create 256-bit vectors from two other 128-bit ones. 
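+// As an illustrative sketch: concatenating two v4f32 halves this way yields a
+// v8f32 whose lanes 0-3 come from the first operand and lanes 4-7 from the
+// second, i.e. a vinsertf128 of the second operand into the widened first.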
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  MVT ResVT = Op.getValueType().getSimpleVT();
+
+  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+
+  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 2);
+
+  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // from two other 128-bit ones.
+  return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+static SDValue
+LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems-1)/8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symmetric for both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+
+    int SndLaneEltIdx = (NumLanes == 2) ?
+      SVOp->getMaskElt(i + NumElemsInLane) : -1;
+    int EltIdx = SVOp->getMaskElt(i);
+
+    if ((EltIdx < 0 || EltIdx == (int)i) &&
+        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) &&
+        (SndLaneEltIdx < 0 ||
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1<<i);
+    else
+      return SDValue();
+  }
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all]   pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+                         SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 8> MaskVals;
+
+  // Determine if more than 1 of the words in each of the low and high quadwords
+  // of the result come from the same quadword of one of the two inputs.  Undef
+  // mask values count as coming from any quadword, for better codegen.
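+  // For example (an illustrative mask, not a special case in the code): the
+  // mask <0,1,2,3,8,9,10,11> scores LoQuad = {4,0,0,0} and HiQuad = {0,0,4,0},
+  // so BestLoQuad = 0 (quad 0 of V1) and BestHiQuad = 2 (quad 0 of V2).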
+  unsigned LoQuad[] = { 0, 0, 0, 0 };
+  unsigned HiQuad[] = { 0, 0, 0, 0 };
+  std::bitset<4> InputQuads;
+  for (unsigned i = 0; i < 8; ++i) {
+    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
+    int EltIdx = SVOp->getMaskElt(i);
+    MaskVals.push_back(EltIdx);
+    if (EltIdx < 0) {
+      ++Quad[0];
+      ++Quad[1];
+      ++Quad[2];
+      ++Quad[3];
+      continue;
+    }
+    ++Quad[EltIdx / 4];
+    InputQuads.set(EltIdx / 4);
+  }
+
+  int BestLoQuad = -1;
+  unsigned MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (LoQuad[i] > MaxQuad) {
+      BestLoQuad = i;
+      MaxQuad = LoQuad[i];
+    }
+  }
+
+  int BestHiQuad = -1;
+  MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (HiQuad[i] > MaxQuad) {
+      BestHiQuad = i;
+      MaxQuad = HiQuad[i];
+    }
+  }
+
+  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If there are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+  bool V1Used = InputQuads[0] || InputQuads[1];
+  bool V2Used = InputQuads[2] || InputQuads[3];
+  if (Subtarget->hasSSSE3()) {
+    if (InputQuads.count() == 2 && V1Used && V2Used) {
+      BestLoQuad = InputQuads[0] ? 0 : 1;
+      BestHiQuad = InputQuads[2] ? 2 : 3;
+    }
+    if (InputQuads.count() > 2) {
+      BestLoQuad = -1;
+      BestHiQuad = -1;
+    }
+  }
+
+  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+  // the shuffle mask.  If a quad is scored as -1, it contains words from all
+  // 4 input quadwords.
+  SDValue NewV;
+  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+    int MaskV[] = {
+      BestLoQuad < 0 ? 0 : BestLoQuad,
+      BestHiQuad < 0 ? 1 : BestHiQuad
+    };
+    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
+    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
+
+    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+    // source words for the shuffle, to aid later transformations.
+    bool AllWordsInNewV = true;
+    bool InOrder[2] = { true, true };
+    for (unsigned i = 0; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx != (int)i)
+        InOrder[i/4] = false;
+      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+        continue;
+      AllWordsInNewV = false;
+      break;
+    }
+
+    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+    if (AllWordsInNewV) {
+      for (int i = 0; i != 8; ++i) {
+        int idx = MaskVals[i];
+        if (idx < 0)
+          continue;
+        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+        if ((idx != i) && idx < 4)
+          pshufhw = false;
+        if ((idx != i) && idx > 3)
+          pshuflw = false;
+      }
+      V1 = NewV;
+      V2Used = false;
+      BestLoQuad = 0;
+      BestHiQuad = 1;
+    }
+
+    // If we've eliminated the use of V2, and the new mask is a pshuflw or
+    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
+    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
+      unsigned TargetMask = 0;
+      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp)
+                           : getShufflePSHUFLWImmediate(SVOp);
+      V1 = NewV.getOperand(0);
+      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
+    }
+  }
+
+  // Promote splats to a larger type which usually leads to more efficient code.
+ // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + + // If we have SSSE3, and all words of the result are from 1 input vector, + // case 2 is generated, otherwise case 3 is generated. If no SSSE3 + // is present, fall back to case 4. + if (Subtarget->hasSSSE3()) { + SmallVector<SDValue,16> pshufbMask; + + // If we have elements from both input vectors, set the high bit of the + // shuffle mask element to zero out elements that come from V2 in the V1 + // mask, and elements that come from V1 in the V2 mask, so that the two + // results can be OR'd together. + bool TwoInputs = V1Used && V2Used; + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; + int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); + } + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + if (!TwoInputs) + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. + pshufbMask.clear(); + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; + int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); + } + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + } + + // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, + // and update MaskVals with new element order. + std::bitset<8> InOrder; + if (BestLoQuad >= 0) { + int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; + for (int i = 0; i != 4; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + InOrder.set(i); + } else if ((idx / 4) == BestLoQuad) { + MaskV[i] = idx & 3; + InOrder.set(i); + } + } + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), + &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); + NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, + NewV.getOperand(0), + getShufflePSHUFLWImmediate(SVOp), DAG); + } + } + + // If BestHi >= 0, generate a pshufhw to put the high elements in order, + // and update MaskVals with the new element order. 
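+  // (pshufhw leaves words 0-3 in place and permutes words 4-7 according to an
+  // 8-bit immediate, two bits per destination word; pshuflw above is the
+  // mirror image for words 0-3.)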
+ if (BestHiQuad >= 0) { + int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; + for (unsigned i = 4; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + InOrder.set(i); + } else if ((idx / 4) == BestHiQuad) { + MaskV[i] = (idx & 3) + 4; + InOrder.set(i); + } + } + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), + &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); + NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, + NewV.getOperand(0), + getShufflePSHUFHWImmediate(SVOp), DAG); + } + } + + // In case BestHi & BestLo were both -1, which means each quadword has a word + // from each of the four input quadwords, calculate the InOrder bitvector now + // before falling through to the insert/extract cleanup. + if (BestLoQuad == -1 && BestHiQuad == -1) { + NewV = V1; + for (int i = 0; i != 8; ++i) + if (MaskVals[i] < 0 || MaskVals[i] == i) + InOrder.set(i); + } + + // The other elements are put in the right place using pextrw and pinsrw. + for (unsigned i = 0; i != 8; ++i) { + if (InOrder[i]) + continue; + int EltIdx = MaskVals[i]; + if (EltIdx < 0) + continue; + SDValue ExtOp = (EltIdx < 8) ? + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + DAG.getIntPtrConstant(EltIdx - 8)); + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, + DAG.getIntPtrConstant(i)); + } + return NewV; +} + +// v16i8 shuffles - Prefer shuffles in the following order: +// 1. [ssse3] 1 x pshufb +// 2. [ssse3] 2 x pshufb + 1 x por +// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw +static +SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG, + const X86TargetLowering &TLI) { + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + ArrayRef<int> MaskVals = SVOp->getMask(); + + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + + // If we have SSSE3, case 1 is generated when all result bytes come from + // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is + // present, fall back to case 3. + + // If SSSE3, use 1 pshufb instruction per vector with elements in the result. + if (TLI.getSubtarget()->hasSSSE3()) { + SmallVector<SDValue,16> pshufbMask; + + // If all result elements are from one input vector, then only translate + // undef mask values to 0x80 (zero out result) in the pshufb mask. + // + // Otherwise, we have elements from both input vectors, and must zero out + // elements that come from V2 in the first mask, and V1 in the second mask + // so that we can OR them together. + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || EltIdx >= 16) + EltIdx = 0x80; + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + + // As PSHUFB will zero elements with negative indices, it's safe to ignore + // the 2nd operand if it's undefined or zero. + if (V2.getOpcode() == ISD::UNDEF || + ISD::isBuildVectorAllZeros(V2.getNode())) + return V1; + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. 
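+    // For example (illustrative only): if MaskVals[i] == 20, the first pshufb
+    // mask holds 0x80 at byte i (zeroing V1's contribution) and the second
+    // holds 20 - 16 == 4, so the OR picks byte 4 of V2 for result byte i.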
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+  }
+
+  // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
+  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
+  // the 16 different words that comprise the two doublequadword input vectors.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+  SDValue NewV = V1;
+  for (int i = 0; i != 8; ++i) {
+    int Elt0 = MaskVals[i*2];
+    int Elt1 = MaskVals[i*2+1];
+
+    // This word of the result is all undef, skip it.
+    if (Elt0 < 0 && Elt1 < 0)
+      continue;
+
+    // This word of the result is already in the correct place, skip it.
+    if ((Elt0 == i*2) && (Elt1 == i*2+1))
+      continue;
+
+    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+    SDValue InsElt;
+
+    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
+    // together using a single extract, load it and store it.
+    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                         DAG.getIntPtrConstant(i));
+      continue;
+    }
+
+    // If Elt1 is defined, extract it from the appropriate source.  If the
+    // source byte is not also odd, shift the extracted word left 8 bits;
+    // otherwise clear the bottom 8 bits if we need to do an OR.
+    if (Elt1 >= 0) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      if ((Elt1 & 1) == 0)
+        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+                             DAG.getConstant(8,
+                                 TLI.getShiftAmountTy(InsElt.getValueType())));
+      else if (Elt0 >= 0)
+        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+                             DAG.getConstant(0xFF00, MVT::i16));
+    }
+    // If Elt0 is defined, extract it from the appropriate source.  If the
+    // source byte is not also even, shift the extracted word right 8 bits.  If
+    // Elt1 was also defined, OR the extracted values together before
+    // inserting them in the result.
+    if (Elt0 >= 0) {
+      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+      if ((Elt0 & 1) != 0)
+        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(8,
+                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
+      else if (Elt1 >= 0)
+        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(0x00FF, MVT::i16));
+      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+                         : InsElt0;
+    }
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                       DAG.getIntPtrConstant(i));
+  }
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
+}
+
+// v32i8 shuffles - Translate to VPSHUFB if possible.
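+// (AVX2's VPSHUFB permutes the two 128-bit lanes independently, so a mask
+// entry may only name a byte within its own lane; see the cross-lane check
+// below.)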
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+                                 const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // VPSHUFB may be generated if
+  // (1) one of the input vectors is undef or zeroinitializer.
+  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+  // and (2) the mask indexes don't cross the 128-bit lane boundary.
+  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
+      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+    return SDValue();
+
+  if (V1IsAllZero && !V2IsAllZero) {
+    CommuteVectorShuffleMask(MaskVals, 32);
+    V1 = V2;
+  }
+  SmallVector<SDValue, 32> pshufbMask;
+  for (unsigned i = 0; i != 32; i++) {
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0 || EltIdx >= 32)
+      EltIdx = 0x80;
+    else {
+      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+        // Cross lane is not allowed.
+        return SDValue();
+      EltIdx &= 0xf;
+    }
+    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+  }
+  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v32i8, &pshufbMask[0], 32));
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements points to elements in
+/// the right sequence. e.g.
+/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  DebugLoc dl = SVOp->getDebugLoc();
+  unsigned NumElems = VT.getVectorNumElements();
+  MVT NewVT;
+  unsigned Scale;
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected!");
+  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
+  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
+  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
+  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
+  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
+  }
+
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i != NumElems; i += Scale) {
+    int StartIdx = -1;
+    for (unsigned j = 0; j != Scale; ++j) {
+      int EltIdx = SVOp->getMaskElt(i+j);
+      if (EltIdx < 0)
+        continue;
+      if (StartIdx < 0)
+        StartIdx = (EltIdx / Scale);
+      if (EltIdx != (int)(StartIdx*Scale + j))
+        return SDValue();
+    }
+    MaskVec.push_back(StartIdx);
+  }
+
+  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
+  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, EVT OpVT,
+                            SDValue SrcOp, SelectionDAG &DAG,
+                            const X86Subtarget *Subtarget, DebugLoc dl) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = NULL;
+    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BITCAST, dl, VT,
+                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   OpVT,
+                                                   SrcOp.getOperand(0)
+                                                     .getOperand(0))));
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                 DAG.getNode(ISD::BITCAST, dl,
+                                             OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles that
+/// cannot be matched by any known target-specific shuffle.
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLaneElems = NumElems / 2;
+
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT EltVT = VT.getVectorElementType();
+  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+  SDValue Output[2];
+
+  SmallVector<int, 16> Mask;
+  for (unsigned l = 0; l < 2; ++l) {
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with UseBuildVector set.
+    bool UseBuildVector = false;
+    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
+    unsigned LaneStart = l * NumLaneElems;
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      // The mask element.  This indexes into the input.
+      int Idx = SVOp->getMaskElt(i+LaneStart);
+      if (Idx < 0) {
+        // The mask element does not index into any input vector.
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // The input vector this mask element indexes into.
+      int Input = Idx / NumLaneElems;
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NumLaneElems;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input)
+          // This input vector is already an operand.
+          break;
+        if (InputUsed[OpNo] < 0) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
+
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        UseBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Mask.push_back(Idx + OpNo * NumLaneElems);
+    }
+
+    if (UseBuildVector) {
+      SmallVector<SDValue, 16> SVOps;
+      for (unsigned i = 0; i != NumLaneElems; ++i) {
+        // The mask element.  This indexes into the input.
+        int Idx = SVOp->getMaskElt(i+LaneStart);
+        if (Idx < 0) {
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
+
+        // The input vector this mask element indexes into.
+        int Input = Idx / NumElems;
+
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NumElems;
+
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    SVOp->getOperand(Input),
+                                    DAG.getIntPtrConstant(Idx)));
+      }
+
+      // Construct the output using a BUILD_VECTOR.
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+                              SVOps.size());
+    } else if (InputUsed[0] < 0) {
+      // No input vectors were used!  The result is undefined.
+      Output[l] = DAG.getUNDEF(NVT);
+    } else {
+      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
+                                        (InputUsed[0] % 2) * NumLaneElems,
+                                        DAG, dl);
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
+        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
+                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
+      // At least one input vector was used.  Create a new shuffle vector.
+      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+    }
+
+    Mask.clear();
+  }
+
+  // Concatenate the result back.
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
+static SDValue
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+  assert(VT.is128BitVector() && "Unsupported vector size");
+
+  std::pair<int, int> Locs[4];
+  int Mask1[] = { -1, -1, -1, -1 };
+  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  unsigned NumHi = 0;
+  unsigned NumLo = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else {
+      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+      if (Idx < 4) {
+        Locs[i] = std::make_pair(0, NumLo);
+        Mask1[NumLo] = Idx;
+        NumLo++;
+      } else {
+        Locs[i] = std::make_pair(1, NumHi);
+        if (2+NumHi < 4)
+          Mask1[2+NumHi] = Idx;
+        NumHi++;
+      }
+    }
+  }
+
+  if (NumLo <= 2 && NumHi <= 2) {
+    // If no more than two elements come from either vector, this can be
+    // implemented with two shuffles.  The first shuffle gathers the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, puts the elements into the right order.
+    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    int Mask2[] = { -1, -1, -1, -1 };
+
+    for (unsigned i = 0; i != 4; ++i)
+      if (Locs[i].first != -1) {
+        unsigned Idx = (i < 2) ? 0 : 4;
+        Idx += Locs[i].first * 2 + Locs[i].second;
+        Mask2[i] = Idx;
+      }
+
+    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+  }
+
+  if (NumLo == 3 || NumHi == 3) {
+    // Otherwise, we must have three elements from one vector, call it X, and
+    // one element from the other, call it Y.  First, use a shufps to build an
+    // intermediate vector with the one element from Y and the element from X
+    // that will be in the same half in the final destination (the indexes don't
+    // matter).  Then, use a shufps to build the final vector, taking the half
+    // containing the element from Y from the intermediate, and the other half
+    // from X.
+    if (NumHi == 3) {
+      // Normalize it so the 3 elements come from V1.
+      CommuteVectorShuffleMask(PermMask, 4);
+      std::swap(V1, V2);
+    }
+
+    // Find the element from V2.
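+    // (With PermMask <0,1,4,2>, for instance, the loop below stops at
+    // HiIndex == 2, the position of the single V2 element.)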
+    unsigned HiIndex;
+    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+      int Val = PermMask[HiIndex];
+      if (Val < 0)
+        continue;
+      if (Val >= 4)
+        break;
+    }
+
+    Mask1[0] = PermMask[HiIndex];
+    Mask1[1] = -1;
+    Mask1[2] = PermMask[HiIndex^1];
+    Mask1[3] = -1;
+    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    if (HiIndex >= 2) {
+      Mask1[0] = PermMask[0];
+      Mask1[1] = PermMask[1];
+      Mask1[2] = HiIndex & 1 ? 6 : 4;
+      Mask1[3] = HiIndex & 1 ? 4 : 6;
+      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+    }
+
+    Mask1[0] = HiIndex & 1 ? 2 : 0;
+    Mask1[1] = HiIndex & 1 ? 0 : 2;
+    Mask1[2] = PermMask[2];
+    Mask1[3] = PermMask[3];
+    if (Mask1[2] >= 0)
+      Mask1[2] += 4;
+    if (Mask1[3] >= 0)
+      Mask1[3] += 4;
+    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+  }
+
+  // Break it into (shuffle shuffle_hi, shuffle_lo).
+  int LoMask[] = { -1, -1, -1, -1 };
+  int HiMask[] = { -1, -1, -1, -1 };
+
+  int *MaskPtr = LoMask;
+  unsigned MaskIdx = 0;
+  unsigned LoIdx = 0;
+  unsigned HiIdx = 2;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (i == 2) {
+      MaskPtr = HiMask;
+      MaskIdx = 1;
+      LoIdx = 0;
+      HiIdx = 2;
+    }
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else if (Idx < 4) {
+      Locs[i] = std::make_pair(MaskIdx, LoIdx);
+      MaskPtr[LoIdx] = Idx;
+      LoIdx++;
+    } else {
+      Locs[i] = std::make_pair(MaskIdx, HiIdx);
+      MaskPtr[HiIdx] = Idx;
+      HiIdx++;
+    }
+  }
+
+  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+  int MaskOps[] = { -1, -1, -1, -1 };
+  for (unsigned i = 0; i != 4; ++i)
+    if (Locs[i].first != -1)
+      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
+  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+static bool MayFoldVectorLoad(SDValue V) {
+  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    V = V.getOperand(0);
+  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+    // BUILD_VECTOR (load), undef
+    V = V.getOperand(0);
+
+  return MayFoldLoad(V);
+}
+
+static
+SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // Canonicalize to v2f64.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
+                                          V1, DAG));
+}
+
+static
+SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+                        bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  assert(VT != MVT::v2i64 && "unsupported shuffle type");
+
+  if (HasSSE2 && VT == MVT::v2f64)
+    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
+
+  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
+}
+
+static
+SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
+         "unsupported shuffle type");
+
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
+
+  // v4i32 or v4f32
+  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
+}
+
+static
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
+  // operand of these instructions is only memory, so check if there's a
+  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
+  // same masks.
+  bool CanFoldLoad = false;
+
+  // Trivial case, when V2 comes from a load.
+  if (MayFoldVectorLoad(V2))
+    CanFoldLoad = true;
+
+  // When V1 is a load, it can be folded later into a store in isel, example:
+  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
+  //    turns into:
+  //  (MOVLPSmr addr:$src1, VR128:$src2)
+  // So, recognize this potential and also use MOVLPS or MOVLPD
+  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+    CanFoldLoad = true;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  if (CanFoldLoad) {
+    if (HasSSE2 && NumElems == 2)
+      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
+
+    if (NumElems == 4)
+      // If we don't care about the second element, proceed to use movss.
+      if (SVOp->getMaskElt(1) != -1)
+        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+  }
+
+  // movl and movlp will both match v2i64, but v2i64 is never matched by
+  // movl earlier because we make it strict to avoid messing with the movlp load
+  // folding logic (see the code above the getMOVLP call). Match it here then;
+  // this is horrible, but will stay like this until we move all shuffle
+  // matching to x86 specific nodes. Note that for the 1st condition all
+  // types are matched with movsd.
+  if (HasSSE2) {
+    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+    // as to remove this logic from here, as much as possible
+    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
+      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+  }
+
+  assert(VT != MVT::v4i32 && "unsupported shuffle type");
+
+  // Invert the operand order and use SHUFPS to match it.
+  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
+                              getShuffleSHUFImmediate(SVOp), DAG);
+}
+
+// Reduce a vector shuffle to zext.
+SDValue
+X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+  // PMOVZX is only available from SSE41.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // Only AVX2 supports 256-bit vector integer extension.
+  if (!Subtarget->hasInt256() && VT.is256BitVector())
+    return SDValue();
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Extension is a unary operation, and the element type of the source vector
+  // cannot be i64 or wider.
+  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+      VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+  while ((1U << Shift) < NumElems) {
+    if (SVOp->getMaskElt(1U << Shift) == 1)
+      break;
+    Shift += 1;
+    // The maximal ratio is 8, i.e. from i8 to i64.
+    if (Shift > 3)
+      return SDValue();
+  }
+
+  // Check the shuffle mask.
+  unsigned Mask = (1U << Shift) - 1;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if ((i & Mask) != 0 && EltIdx != -1)
+      return SDValue();
+    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+      return SDValue();
+  }
+
+  LLVMContext *Context = DAG.getContext();
+  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
+  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+
+  if (!isTypeLegal(NVT))
+    return SDValue();
+
+  // Simplify the operand as it's prepared to be fed into the shuffle.
+  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+  if (V1.getOpcode() == ISD::BITCAST &&
+      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V1.getOperand(0)
+        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+    ConstantSDNode *CIdx =
+      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+    // If it's foldable, i.e. a normal load with a single use, we will let
+    // code selection fold it. Otherwise, we will shorten the conversion
+    // sequence.
+    if (CIdx && CIdx->getZExtValue() == 0 &&
+        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
+      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+        // The "ext_vec_elt" node is wider than the result node.
+        // In this case we should extract a subvector from V.
+        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
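+        // (E.g. if x is a 256-bit vector and V1 is 128-bit, Ratio is 2 and
+        // the low 128-bit subvector of x is extracted before the bitcast.)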
+ unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); + EVT FullVT = V.getValueType(); + EVT SubVecVT = EVT::getVectorVT(*Context, + FullVT.getVectorElementType(), + FullVT.getVectorNumElements()/Ratio); + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, + DAG.getIntPtrConstant(0)); + } + V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } + } + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); +} + +SDValue +X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + MVT VT = Op.getValueType().getSimpleVT(); + DebugLoc dl = Op.getDebugLoc(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (isZeroShuffle(SVOp)) + return getZeroVector(VT, Subtarget, DAG, dl); + + // Handle splat operations + if (SVOp->isSplat()) { + // Use vbroadcast whenever the splat comes from a foldable load + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; + } + + // Check integer expanding shuffles. + SDValue NewOp = LowerVectorIntExtend(Op, DAG); + if (NewOp.getNode()) + return NewOp; + + // If the shuffle can be profitably rewritten as a narrower shuffle, then + // do it! + if (VT == MVT::v8i16 || VT == MVT::v16i8 || + VT == MVT::v16i16 || VT == MVT::v32i8) { + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); + if (NewOp.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); + } else if ((VT == MVT::v4i32 || + (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { + // FIXME: Figure out a cleaner way to do this. + // Try to make use of movq to zero out the top part. + if (ISD::isBuildVectorAllZeros(V2.getNode())) { + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); + if (NewOp.getNode()) { + MVT NewVT = NewOp.getValueType().getSimpleVT(); + if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), + NewVT, true, false)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), + DAG, Subtarget, dl); + } + } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); + if (NewOp.getNode()) { + MVT NewVT = NewOp.getValueType().getSimpleVT(); + if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), + DAG, Subtarget, dl); + } + } + } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT VT = Op.getValueType().getSimpleVT(); + DebugLoc dl = Op.getDebugLoc(); + unsigned NumElems = VT.getVectorNumElements(); + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsSplat = false; + bool V2IsSplat = false; + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasFp256 = Subtarget->hasFp256(); + bool HasInt256 = Subtarget->hasInt256(); + MachineFunction &MF = DAG.getMachineFunction(); + bool OptForSize = MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); + + // Vector shuffle lowering takes 3 steps: + // + // 1) Normalize the input vectors. 
Here splats, zeroed vectors, profitable + // narrowing and commutation of operands should be handled. + // 2) Matching of shuffles with known shuffle masks to x86 target specific + // shuffle nodes. + // 3) Rewriting of unmatched masks into new generic shuffle operations, + // so the shuffle can be broken into other shuffles and the legalizer can + // try the lowering again. + // + // The general idea is that no vector_shuffle operation should be left to + // be matched during isel, all of them must be converted to a target specific + // node here. + + // Normalize the input vectors. Here splats, zeroed vectors, profitable + // narrowing and commutation of operands should be handled. The actual code + // doesn't include all of those, work in progress... + SDValue NewOp = NormalizeVectorShuffle(Op, DAG); + if (NewOp.getNode()) + return NewOp; + + SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); + + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and + // unpckh_undef). Only use pshufd if speed is more important than size. + if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); + + if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && + V2IsUndef && MayFoldVectorLoad(V1)) + return getMOVDDup(Op, dl, V1, DAG); + + if (isMOVHLPS_v_undef_Mask(M, VT)) + return getMOVHighToLow(Op, dl, DAG); + + // Use to match splats + if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && + (VT == MVT::v2f64 || VT == MVT::v2i64)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); + + if (isPSHUFDMask(M, VT)) { + // The actual implementation will match the mask in the if above and then + // during isel it can match several different instructions, not only pshufd + // as its name says, sad but true, emulate the behavior for now... + if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = getShuffleSHUFImmediate(SVOp); + + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); + + if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, + DAG); + + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, + TargetMask, DAG); + } + + // Check if this can be converted into a logical shift. + bool isLeft = false; + unsigned ShAmt = 0; + SDValue ShVal; + bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + if (isShift && ShVal.hasOneUse()) { + // If the shifted value has multiple uses, it may be cheaper to use + // v_set0 + movlhps or movhlps, etc. + MVT EltVT = VT.getVectorElementType(); + ShAmt *= EltVT.getSizeInBits(); + return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); + } + + if (isMOVLMask(M, VT)) { + if (ISD::isBuildVectorAllZeros(V1.getNode())) + return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); + if (!isMOVLPMask(M, VT)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + + if (VT == MVT::v4i32 || VT == MVT::v4f32) + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + } + } + + // FIXME: fold these into legal mask. 
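+  // (movlhps, matched just below, copies the low two floats of the second
+  // operand into the high half of the first: mask <0,1,4,5> for v4f32.)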
+  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
+    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+
+  if (isMOVHLPSMask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
+
+  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
+
+  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
+
+  if (isMOVLPMask(M, VT))
+    return getMOVLP(Op, dl, DAG, HasSSE2);
+
+  if (ShouldXformToMOVHLPS(M, VT) ||
+      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  if (isShift) {
+    // No better options. Use a vshldq / vsrldq.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
+  V1IsSplat = isSplatVector(V1.getNode());
+  V2IsSplat = isSplatVector(V2.getNode());
+
+  // Canonicalize the splat or undef, if present, to be on the RHS.
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = true;
+  }
+
+  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
+    // Shuffling the low element of V1 into undef, just return V1.
+    if (V2IsUndef)
+      return V1;
+    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+    // the instruction selector will not match, so get a canonical MOVL with
+    // swapped operands to undo the commute.
+    return getMOVL(DAG, dl, VT, V2, V1);
+  }
+
+  if (isUNPCKLMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+  if (isUNPCKHMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+
+  if (V2IsSplat) {
+    // Normalize mask so all entries that point to V2 point to its first
+    // element, then try to match unpck{h|l} again. If it matches, return a
+    // new vector_shuffle with the corrected mask.
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
+
+  if (Commuted) {
+    // Commute it back and try unpck* again.
+    // FIXME: this seems wrong.
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = false;
+
+    if (isUNPCKLMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+    if (isUNPCKHMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
+
+  // Normalize the node to match x86 shuffle ops if needed
+  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  // The checks below are all present in isShuffleMaskLegal, but they are
+  // inlined here right now to enable us to directly emit target specific
+  // nodes, and remove one by one until they don't return Op anymore.
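+  // (As one sketch of these checks: a v16i8 mask <5,6,...,15,16,...,20> is
+  // matched as palignr, which funnels the byte concatenation V2:V1 right by
+  // 5 bytes.)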
+ + if (isPALIGNRMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, + getShufflePALIGNRImmediate(SVOp), + DAG); + + if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && + SVOp->getSplatIndex() == 0 && V2IsUndef) { + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + } + + if (isPSHUFHWMask(M, VT, HasInt256)) + return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, + getShufflePSHUFHWImmediate(SVOp), + DAG); + + if (isPSHUFLWMask(M, VT, HasInt256)) + return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, + getShufflePSHUFLWImmediate(SVOp), + DAG); + + if (isSHUFPMask(M, VT, HasFp256)) + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, + getShuffleSHUFImmediate(SVOp), DAG); + + if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); + + //===--------------------------------------------------------------------===// + // Generate target specific nodes for 128 or 256-bit shuffles only + // supported in the AVX instruction set. + // + + // Handle VMOVDDUPY permutations + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) + return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); + + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasFp256)) { + if (HasInt256 && VT == MVT::v8i32) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + } + + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasFp256)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); + + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; + + if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { + SmallVector<SDValue, 8> permclMask; + for (unsigned i = 0; i != 8; ++i) { + permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); + } + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, + &permclMask[0], 8); + // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 + return DAG.getNode(X86ISD::VPERMV, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); + } + + if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64)) + return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, + getShuffleCLImmediate(SVOp), DAG); + + //===--------------------------------------------------------------------===// + // Since no target specific shuffle was selected for this generic one, + // lower it into other known shuffles. FIXME: this isn't true yet, but + // this is the plan. + // + + // Handle v8i16 specifically since SSE can do byte extraction and insertion. + if (VT == MVT::v8i16) { + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); + if (NewOp.getNode()) + return NewOp; + } + + if (VT == MVT::v16i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); + if (NewOp.getNode()) + return NewOp; + } + + if (VT == MVT::v32i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); + if (NewOp.getNode()) + return NewOp; + } + + // Handle all 128-bit wide vectors with 4 elements, and match them with + // several different shuffle types. 
+ if (NumElems == 4 && VT.is128BitVector()) + return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); + + // Handle general 256-bit shuffles + if (VT.is256BitVector()) + return LowerVECTOR_SHUFFLE_256(SVOp, DAG); + + return SDValue(); +} + +static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); + DebugLoc dl = Op.getDebugLoc(); + + if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector()) + return SDValue(); + + if (VT.getSizeInBits() == 8) { + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT.getSizeInBits() == 16) { + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + // If Idx is 0, it's cheaper to do a move instead of a pextrw. + if (Idx == 0) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, + MVT::v4i32, + Op.getOperand(0)), + Op.getOperand(1))); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT == MVT::f32) { + // EXTRACTPS outputs to a GPR32 register which will require a movd to copy + // the result back to FR32 register. It's only worth matching if the + // result has a single use which is a store or a bitcast to i32. And in + // the case of a store, it's not worth it if the index is a constant 0, + // because a MOVSSmr can be used instead, which is smaller and faster. + if (!Op.hasOneUse()) + return SDValue(); + SDNode *User = *Op.getNode()->use_begin(); + if ((User->getOpcode() != ISD::STORE || + (isa<ConstantSDNode>(Op.getOperand(1)) && + cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && + (User->getOpcode() != ISD::BITCAST || + User->getValueType(0) != MVT::i32)) + return SDValue(); + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, + Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); + } + + if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. + if (isa<ConstantSDNode>(Op.getOperand(1))) + return Op; + } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + if (!isa<ConstantSDNode>(Op.getOperand(1))) + return SDValue(); + + SDValue Vec = Op.getOperand(0); + MVT VecVT = Vec.getValueType().getSimpleVT(); + + // If this is a 256-bit vector result, first extract the 128-bit vector and + // then extract the element from the 128-bit vector. + if (VecVT.is256BitVector()) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + unsigned NumElems = VecVT.getVectorNumElements(); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + // Get the 128-bit vector. 
+    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+
+    if (IdxVal >= NumElems/2)
+      IdxVal -= NumElems/2;
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+                       DAG.getConstant(IdxVal, MVT::i32));
+  }
+
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+  if (Subtarget->hasSSE41()) {
+    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  MVT VT = Op.getValueType().getSimpleVT();
+  DebugLoc dl = Op.getDebugLoc();
+  // TODO: handle v16i8.
+  if (VT.getSizeInBits() == 16) {
+    SDValue Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32, Vec),
+                                     Op.getOperand(1)));
+    // Transform it so it matches pextrw, which produces a 32-bit result.
+    MVT EltVT = MVT::i32;
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+                                 DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT.getSizeInBits() == 32) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // SHUFPS the element to the lowest double word, then movss.
+    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  if (VT.getSizeInBits() == 64) {
+    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+    //        to match extract_elt for f64.
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+    int Mask[2] = { 1, -1 };
+    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  if (!VT.is128BitVector())
+    return SDValue();
+
+  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
+      isa<ConstantSDNode>(N2)) {
+    unsigned Opc;
+    if (VT == MVT::v8i16)
+      Opc = X86ISD::PINSRW;
+    else if (VT == MVT::v16i8)
+      Opc = X86ISD::PINSRB;
+    else
+      Opc = X86ISD::PINSRB;
+
+    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
+    // second argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+  }
+
+  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+    // Bits [7:6] of the constant are the source select. This will always be
+    // zero here. The DAG Combiner may combine an extract_elt index into these
+    // bits. For example (insert (extract, 3), 2) could be matched by putting
+    // the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select. This is the
+    // value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+    // combine either bitwise AND or insert of float 0.0 to set these bits.
+    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+    // Create this as a scalar to vector.
+    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+  }
+
+  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
+    // PINSR* works with a constant index.
+    return Op;
+  }
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  // If this is a 256-bit vector result, first extract the 128-bit vector,
+  // insert the element into the extracted half and then place it back.
+  if (VT.is256BitVector()) {
+    if (!isa<ConstantSDNode>(N2))
+      return SDValue();
+
+    // Get the desired 128-bit vector half.
+    unsigned NumElems = VT.getVectorNumElements();
+    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
+    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+
+    // Insert the element into the desired half.
+    bool Upper = IdxVal >= NumElems/2;
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+                    DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal,
+                                    MVT::i32));
+
+    // Insert the changed part back to the 256-bit vector.
+    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+  }
+
+  if (Subtarget->hasSSE41())
+    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+    // Transform it so it matches pinsrw, which expects a 16-bit value in a
+    // GR32 as its second argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+  }
+  return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  MVT OpVT = Op.getValueType().getSimpleVT();
+
+  // If this is a 256-bit vector result, first insert into a 128-bit
+  // vector and then insert into the 256-bit vector.
+  if (!OpVT.is128BitVector()) {
+    // Insert into a 128-bit vector.
+    EVT VT128 = EVT::getVectorVT(*Context,
+                                 OpVT.getVectorElementType(),
+                                 OpVT.getVectorNumElements() / 2);
+
+    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+    // Insert the 128-bit vector.
+    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+  }
+
+  if (OpVT == MVT::v1i64 &&
+      Op.getOperand(0).getValueType() == MVT::i64)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+  return DAG.getNode(ISD::BITCAST, dl, OpVT,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    DebugLoc dl = Op.getNode()->getDebugLoc();
+    SDValue Vec = Op.getNode()->getOperand(0);
+    SDValue Idx = Op.getNode()->getOperand(1);
+
+    if (Op.getNode()->getValueType(0).is128BitVector() &&
+        Vec.getNode()->getValueType(0).is256BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Extract128BitVector(Vec, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    DebugLoc dl = Op.getNode()->getDebugLoc();
+    SDValue Vec = Op.getNode()->getOperand(0);
+    SDValue SubVec = Op.getNode()->getOperand(1);
+    SDValue Idx = Op.getNode()->getOperand(2);
+
+    if (Op.getNode()->getValueType(0).is256BitVector() &&
+        SubVec.getNode()->getValueType(0).is128BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                             CP->getAlignment(),
+                                             CP->getOffset(), OpFlag);
+  DebugLoc DL = CP->getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
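+  // A sketch of the resulting 32-bit ELF PIC sequence (labels and register
+  // choice are hypothetical, not the exact emitted code):
+  //
+  //   calll .L0$pb
+  // .L0$pb:
+  //   popl  %ebx                          // $g, the PIC base
+  //   addl  $_GLOBAL_OFFSET_TABLE_+(.-.L0$pb), %ebx
+  //   leal  .LCPI0_0@GOTOFF(%ebx), %eax   // $g + Offset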
+  if (OpFlag) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc(), getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                          OpFlag);
+  DebugLoc DL = JT->getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc(), getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+      OpFlag = X86II::MO_GOTPCREL;
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOT;
+  } else if (Subtarget->isPICStyleStubPIC()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  } else if (Subtarget->isPICStyleStubNoDynamic()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY;
+  }
+
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+
+  DebugLoc DL = Op.getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->is64Bit()) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc(), getPointerTy()),
+                         Result);
+  }
+
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  // Create the TargetBlockAddress node.
+  unsigned char OpFlags =
+    Subtarget->ClassifyBlockAddressReference();
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+                                             OpFlags);
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+                                      int64_t Offset, SelectionDAG &DAG) const {
+  // Create the TargetGlobalAddress node, folding in the constant
+  // offset if it is legal.
+  unsigned char OpFlags =
+    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  SDValue Result;
+  if (OpFlags == X86II::MO_NO_FLAG &&
+      X86::isOffsetSuitableForCodeModel(Offset, M)) {
+    // A direct static reference to a global.
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+    Offset = 0;
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+  }
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
+
+  // For globals that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlags))
+    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags, bool LocalDynamic = false) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  DebugLoc dl = GA->getDebugLoc();
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
+
+  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+                                           : X86ISD::TLSADDR;
+
+  if (InFlag) {
+    SDValue Ops[] = { Chain, TGA, *InFlag };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+  } else {
+    SDValue Ops[] = { Chain, TGA };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+  }
+
+  // TLSADDR will be codegen'ed as a call. Inform MFI that this function has
+  // calls.
+  MFI->setAdjustsStack(true);
+
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  SDValue InFlag;
+  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+                                   DAG.getNode(X86ISD::GlobalBaseReg,
+                                               DebugLoc(), PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                           SelectionDAG &DAG,
+                                           const EVT PtrVT,
+                                           bool is64Bit) {
+  DebugLoc dl = GA->getDebugLoc();
+
+  // Get the start address of the TLS block for this module.
+  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+                                   .getInfo<X86MachineFunctionInfo>();
+  MFI->incNumLocalDynamicTLSAccesses();
+
+  SDValue Base;
+  if (is64Bit) {
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
+  } else {
+    SDValue InFlag;
+    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+    InFlag = Chain.getValue(1);
+    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+  }
+
+  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+  // of Base.
+
+  // Build x@dtpoff.
+  unsigned char OperandFlags = X86II::MO_DTPOFF;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  // Add x@dtpoff with the base.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                   const EVT PtrVT, TLSModel::Model model,
+                                   bool is64Bit, bool isPIC) {
+  DebugLoc dl = GA->getDebugLoc();
+
+  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+                                                         is64Bit ? 257 : 256));
+
+  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                                      DAG.getIntPtrConstant(0),
+                                      MachinePointerInfo(Ptr),
+                                      false, false, false, 0);
+
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (model == TLSModel::InitialExec) {
+    if (is64Bit) {
+      OperandFlags = X86II::MO_GOTTPOFF;
+      WrapperKind = X86ISD::WrapperRIP;
+    } else {
+      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+    }
+  } else {
+    llvm_unreachable("Unexpected model");
+  }
+
+  // emit "addl x@ntpoff,%eax" (local exec)
+  // or   "addl x@indntpoff,%eax" (initial exec)
+  // or   "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  if (model == TLSModel::InitialExec) {
+    if (isPIC && !is64Bit) {
+      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+                           DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+                           Offset);
+    }
+
+    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+                         MachinePointerInfo::getGOT(), false, false, false,
+                         0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  if (Subtarget->isTargetELF()) {
+    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+    switch (model) {
+      case TLSModel::GeneralDynamic:
+        if (Subtarget->is64Bit())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+      case TLSModel::LocalDynamic:
+        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+                                           Subtarget->is64Bit());
+      case TLSModel::InitialExec:
+      case TLSModel::LocalExec:
+        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
+                                   Subtarget->is64Bit(),
+                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
+    }
+    llvm_unreachable("Unknown TLS model.");
+  }
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin only has one model of TLS. Lower to that.
+    unsigned char OpFlag = 0;
+    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+        X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+    // global base reg.
+    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
+                 !Subtarget->is64Bit();
+    if (PIC32)
+      OpFlag = X86II::MO_TLVP_PIC_BASE;
+    else
+      OpFlag = X86II::MO_TLVP;
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+                                                GA->getValueType(0),
+                                                GA->getOffset(), OpFlag);
+    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+    // With PIC32, the address is actually $g + Offset.
+    if (PIC32)
+      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                           DAG.getNode(X86ISD::GlobalBaseReg,
+                                       DebugLoc(), getPointerTy()),
+                           Offset);
+
+    // Lowering the machine isd will make sure everything is in the right
+    // location.
+    SDValue Chain = DAG.getEntryNode();
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Args[] = { Chain, Offset };
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+
+    // TLSCALL will be codegen'ed as a call. Inform MFI that this function has
+    // calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setAdjustsStack(true);
+
+    // And our return value (tls address) is in the standard call return value
+    // location.
+    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+                              Chain.getValue(1));
+  }
+
+  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
+    // Just use the implicit TLS architecture.
+    // Need to generate something similar to:
+    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+    //                               ; from TEB
+    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
+    //   mov rcx, qword [rdx+rcx*8]
+    //   mov eax, .tls$:tlsvar
+    //   [rax+rcx] contains the address
+    // Windows 64bit: gs:0x58
+    // Windows 32bit: fs:__tls_array
+
+    // If GV is an alias then use the aliasee for determining
+    // thread-localness.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GV = GA->resolveAliasedGlobal(false);
+    DebugLoc dl = GA->getDebugLoc();
+    SDValue Chain = DAG.getEntryNode();
+
+    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+    // use its literal value of 0x2C.
+    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+                                        ? Type::getInt8PtrTy(*DAG.getContext(),
+                                                             256)
+                                        : Type::getInt32PtrTy(*DAG.getContext(),
+                                                              257));
+
+    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
+        (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
+         DAG.getExternalSymbol("_tls_array", getPointerTy()));
+
+    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+                                        MachinePointerInfo(Ptr),
+                                        false, false, false, 0);
+
+    // Load the _tls_index variable.
+    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+    if (Subtarget->is64Bit())
+      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+                           IDX, MachinePointerInfo(), MVT::i32,
+                           false, false, 0);
+    else
+      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+                        false, false, false, 0);
+
+    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+                                    getPointerTy());
+    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+                      false, false, false, 0);
+
+    // Get the offset of the start of the .tls section.
+    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                             GA->getValueType(0),
+                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+    // The address of the thread local variable is the add of the thread
+    // pointer with the offset of the variable.
+    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+  }
+
+  llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue Tmp1 = isSRA ?
+      DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+                  DAG.getConstant(VTBits - 1, MVT::i8))
+      : DAG.getConstant(0, VT);
+
+  SDValue Tmp2, Tmp3;
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  } else {
+    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+  }
+
+  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                DAG.getConstant(VTBits, MVT::i8));
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                             AndNode, DAG.getConstant(0, MVT::i8));
+
+  SDValue Hi, Lo;
+  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+  } else {
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+  }
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT SrcVT = Op.getOperand(0).getValueType();
+
+  if (SrcVT.isVector())
+    return SDValue();
+
+  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
+  // These are really Legal; return the operand so the caller accepts it as
+  // Legal.
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+    return Op;
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      Subtarget->is64Bit()) {
+    return Op;
+  }
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Size = SrcVT.getSizeInBits()/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot,
+                               MachinePointerInfo::getFixedStack(SSFI),
+                               false, false, 0);
+  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                     SDValue StackSlot,
+                                     SelectionDAG &DAG) const {
+  // Build the FILD.
+  DebugLoc DL = Op.getDebugLoc();
+  SDVTList Tys;
+  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+  if (useSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+
+  unsigned ByteSize = SrcVT.getSizeInBits()/8;
+
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
+  }
+  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+                                           X86ISD::FILD, DL,
+                                           Tys, Ops, array_lengthof(Ops),
+                                           SrcVT, MMO);
+
+  if (useSSE) {
+    Chain = Result.getValue(1);
+    SDValue InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
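+    // A representative sequence for i64 -> f64 when the result lives in an
+    // SSE register (slots and registers here are illustrative):
+    //
+    //   fildll  (%esp)          // x87 integer load
+    //   fstpl   8(%esp)         // store as f64, popping the x87 stack
+    //   movsd   8(%esp), %xmm0  // reload into the SSE register file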
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
+    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = {
+      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+    };
+    MachineMemOperand *MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, SSFISize, SSFISize);
+
+    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+                                    Ops, array_lengthof(Ops),
+                                    Op.getValueType(), MMO);
+    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, false, 0);
+  }
+
+  return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // This algorithm is not obvious. Here is what we're trying to output:
+  /*
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1
+       addpd    %xmm1, %xmm0
+     #endif
+  */
+
+  DebugLoc dl = Op.getDebugLoc();
+  LLVMContext *Context = DAG.getContext();
+
+  // Build some magic constants.
+  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+  SmallVector<Constant*,2> CV1;
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4530000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
+  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
+
+  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
+
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+                     DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  // FP constant to bias-correct the final result.
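+  // This is the scalar form of the 2^52 trick above: 0x4330000000000000 is
+  // the f64 bit pattern of 2^52, and OR'ing a 32-bit value x into the low
+  // mantissa bits yields exactly the double 2^52 + x. A sketch of the idea
+  // ('bitcast' below is shorthand, not a real helper):
+  //
+  //   uint64_t bits = 0x4330000000000000ULL | x;         // f64 value 2^52 + x
+  //   double result = bitcast<double>(bits) - 0x1.0p52;  // == (double)x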
+  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                   MVT::f64);
+
+  // Load the 32-bit value into an XMM register.
+  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             Op.getOperand(0));
+
+  // Zero out the upper parts of the register.
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+                     DAG.getIntPtrConstant(0));
+
+  // Or the load with the bias.
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Load)),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Bias)));
+  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
+                   DAG.getIntPtrConstant(0));
+
+  // Subtract the bias.
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+  // Handle final rounding.
+  EVT DestVT = Op.getValueType();
+
+  if (DestVT.bitsLT(MVT::f64))
+    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                       DAG.getIntPtrConstant(0));
+  if (DestVT.bitsGT(MVT::f64))
+    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+  // No extra rounding is needed; the result is already f64.
+  return Sub;
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  EVT SVT = N0.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+         "Custom UINT_TO_FP is not supported!");
+
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                             SVT.getVectorNumElements());
+  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+  EVT SrcVT = N0.getValueType();
+  EVT DstVT = Op.getValueType();
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
+
+  // Make a 64-bit buffer, and use it to build an FILD.
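+  // For the i32 case below, the buffer is filled as { x, 0 }, i.e. the
+  // little-endian image of zext(x) to i64, so the (signed) FILD sees a
+  // non-negative value. Sketch, with a hypothetical 'mem':
+  //
+  //   mem[0..3] = x;   // low word: the unsigned 32-bit input
+  //   mem[4..7] = 0;   // high word: zero, so the i64 equals zext(x)
+  //   fildll (mem)     // signed i64 load; correct since zext(x) >= 0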
+  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+  if (SrcVT == MVT::i32) {
+    SDValue WordOff = DAG.getConstant(4, getPointerTy());
+    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+                                     getPointerTy(), StackSlot, WordOff);
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                                  StackSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+                                  OffsetSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+    return Fild;
+  }
+
+  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot, MachinePointerInfo(),
+                               false, false, 0);
+  // For an i64 source, we need to add the appropriate power of 2 if the input
+  // was negative. This is the same as the optimization in
+  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
+  // we must be careful to do the computation in x87 extended precision, not
+  // in SSE. (The generic code can't know it's OK to do this, or how to.)
+  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction()
+    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                          MachineMemOperand::MOLoad, 8, 8);
+
+  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+                                         array_lengthof(Ops), MVT::i64, MMO);
+
+  APInt FF(32, 0x5F800000ULL);
+
+  // Check whether the sign bit is set.
+  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
+                                 ISD::SETLT);
+
+  // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
+  SDValue FudgePtr = DAG.getConstantPool(
+                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+                             getPointerTy());
+
+  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+  SDValue Zero = DAG.getIntPtrConstant(0);
+  SDValue Four = DAG.getIntPtrConstant(4);
+  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                               Zero, Four);
+  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+
+  // Load the value out, extending it from f32 to f80.
+  // FIXME: Avoid the extend by constructing the right constant pool?
+  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+                                 FudgePtr, MachinePointerInfo::getConstantPool(),
+                                 MVT::f32, false, false, 4);
+  // Extend everything to 80 bits to force it to be done on x87.
+  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+}
+
+std::pair<SDValue,SDValue>
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                   bool IsSigned, bool IsReplace) const {
+  DebugLoc DL = Op.getDebugLoc();
+
+  EVT DstTy = Op.getValueType();
+
+  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_INT to lower!");
+
+  // These are really Legal.
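+  // "Legal" here means a single cvt instruction suffices, e.g.
+  // (illustrative):
+  //
+  //   cvttsd2si %xmm0, %eax   // f64 -> i32 with SSE2
+  //   cvttsd2si %xmm0, %rax   // f64 -> i64, 64-bit mode only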
+  if (DstTy == MVT::i32 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+  if (Subtarget->is64Bit() &&
+      DstTy == MVT::i64 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+
+  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
+  // stack slot, or into the FTOL runtime function.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  unsigned Opc;
+  if (!IsSigned && isIntegerTypeFTOL(DstTy))
+    Opc = X86ISD::WIN_FTOL;
+  else
+    switch (DstTy.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+    }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Value = Op.getOperand(0);
+  EVT TheVT = Op.getOperand(0).getValueType();
+  // FIXME: This causes a redundant load/store if the SSE-class value is
+  // already in memory, such as if it is on the call stack.
+  if (isScalarFPTypeInSSEReg(TheVT)) {
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDValue Ops[] = {
+      Chain, StackSlot, DAG.getValueType(TheVT)
+    };
+
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                              MachineMemOperand::MOLoad, MemSize, MemSize);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
+                                    array_lengthof(Ops), DstTy, MMO);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, MemSize, MemSize);
+
+  if (Opc != X86ISD::WIN_FTOL) {
+    // Build the FP_TO_INT*_IN_MEM.
+    SDValue Ops[] = { Chain, Value, StackSlot };
+    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+                                           Ops, array_lengthof(Ops), DstTy,
+                                           MMO);
+    return std::make_pair(FIST, StackSlot);
+  } else {
+    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
+                               DAG.getVTList(MVT::Other, MVT::Glue),
+                               Chain, Value);
+    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
+                                     MVT::i32, ftol.getValue(1));
+    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
+                                     MVT::i32, eax.getValue(2));
+    SDValue Ops[] = { eax, edx };
+    SDValue pair = IsReplace
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
+      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+    return std::make_pair(pair, SDValue());
+  }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+                              const X86Subtarget *Subtarget) {
+  MVT VT = Op->getValueType(0).getSimpleVT();
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getValueType().getSimpleVT();
+  DebugLoc dl = Op->getDebugLoc();
+
+  // Optimize vectors in AVX mode:
+  //
+  //   v8i16 -> v8i32
+  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
+  //   Concat upper and lower parts.
+  //
+  //   v4i32 -> v4i64
+  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
+  //   Concat upper and lower parts.
+  //
+
+  if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+    return SDValue();
+
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
+
+  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  SDValue Undef = DAG.getUNDEF(InVT);
+  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+                             VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  return SDValue();
+}
+
+SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getValueType().getSimpleVT();
+
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
+      VT.getVectorNumElements() != SVT.getVectorNumElements())
+    return SDValue();
+
+  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
+
+  // AVX2 has better support for integer extending.
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
+  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
+  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
+                           DAG.getVectorShuffle(MVT::v8i16, DL, In,
+                                                DAG.getUNDEF(MVT::v8i16),
+                                                &Mask[0]));
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getValueType().getSimpleVT();
+
+  if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+    if (Subtarget->hasInt256()) {
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+                         DAG.getIntPtrConstant(0));
+    }
+
+    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
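+    // Roughly, for In = <a, b, c, d> (64-bit lanes; an illustrative trace):
+    //   OpLo = <a, b> as v4i32 is <a0, a1, b0, b1>; PSHUFD {0,2,0,0} keeps
+    //   <a0, b0, _, _>, and likewise OpHi yields <c0, d0, _, _>. The final
+    //   MOVLHPS mask {0, 1, 4, 5} concatenates them into <a0, b0, c0, d0>.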
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(0));
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+    // The PSHUFD mask:
+    static const int ShufMask1[] = {0, 2, 0, 0};
+    SDValue Undef = DAG.getUNDEF(VT);
+    OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
+
+    // The MOVLHPS mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
+  }
+
+  if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
+    if (Subtarget->hasInt256()) {
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
+                               &pshufbMask[0], 32);
+      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+
+      static const int ShufMask[] = {0, 2, -1, -1};
+      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
+                                &ShufMask[0]);
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                       DAG.getIntPtrConstant(0));
+      return DAG.getNode(ISD::BITCAST, DL, VT, In);
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+
+    // The PSHUFB mask:
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+    // The MOVLHPS mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+  }
+
+  // Handle truncation of V256 to V128 using shuffles.
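+  // The general pattern (an illustrative case): for v8i32 -> v8i16, view
+  // the source as v16i16 and pick the even lanes, which hold the low half
+  // of each 32-bit element, then take the low 128 bits:
+  //
+  //   MaskVec = { 0, 2, 4, 6, 8, 10, 12, 14, -1, ... }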
+  if (!VT.is128BitVector() || !SVT.is256BitVector())
+    return SDValue();
+
+  assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
+         "Invalid op");
+  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                             NumElems * 2);
+
+  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+  // Prepare the truncation shuffle mask.
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = i * 2;
+  SDValue V = DAG.getVectorShuffle(NVT, DL,
+                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+                                   DAG.getUNDEF(NVT), &MaskVec[0]);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+                     DAG.getIntPtrConstant(0));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MVT VT = Op.getValueType().getSimpleVT();
+  if (VT.isVector()) {
+    if (VT == MVT::v8i16)
+      return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
+                         DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+                                     MVT::v8i32, Op.getOperand(0)));
+    return SDValue();
+  }
+
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ true, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+  if (FIST.getNode() == 0) return Op;
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ false, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  assert(FIST.getNode() && "Unexpected failure");
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc DL = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getValueType().getSimpleVT();
+
+  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                 In, DAG.getUNDEF(SVT)));
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, ~(1ULL << 63))));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, ~(1U << 31))));
+  C = ConstantVector::getSplat(NumElts, C);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::AND, dl, ANDVT,
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+  }
+  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, 1ULL << 63)));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, 1U << 31)));
+  C = ConstantVector::getSplat(NumElts, C);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::XOR, dl, XORVT,
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+  }
+
+  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  LLVMContext *Context = DAG.getContext();
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT SrcVT = Op1.getValueType().getSimpleVT();
+
+  // If the second operand is smaller, extend it first.
+  if (SrcVT.bitsLT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+    SrcVT = VT;
+  }
+  // And if it is bigger, shrink it first.
+  if (SrcVT.bitsGT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+    SrcVT = VT;
+  }
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+
+  // First get the sign bit of the second operand.
+  SmallVector<Constant*,4> CV;
+  if (SrcVT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context,
+                                 APFloat(Sem, APInt(64, 1ULL << 63))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context,
+                                 APFloat(Sem, APInt(32, 1U << 31))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+  // Shift the sign bit right or left if the two operands have different types.
+  if (SrcVT.bitsGT(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
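+    // The f64 sign is bit 63; after the logical right shift by 32 below, it
+    // sits at bit 31 of the low dword, which is exactly the f32 sign
+    // position once the vector is reinterpreted as v4f32.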
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+                          DAG.getIntPtrConstant(0));
+  }
+
+  // Clear the first operand's sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(64, ~(1ULL << 63)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(32, ~(1U << 31)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  C = ConstantVector::get(CV);
+  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+                                  DAG.getConstant(1, VT));
+  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is cast into a wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2;  // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if this is not an EXTRACT_VECTOR_ELT.
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if the index is not a constant.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not a 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+  SmallVector<SDValue, 8> VecIns;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+       I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+    VecIns.push_back(I->first);
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vector is evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is
+    // only 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+                                    SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // CF and OF aren't always set the way we want. Determine which
+  // of these we need.
+  bool NeedCF = false;
+  bool NeedOF = false;
+  switch (X86CC) {
+  default: break;
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  case X86::COND_G: case X86::COND_GE:
+  case X86::COND_L: case X86::COND_LE:
+  case X86::COND_O: case X86::COND_NO:
+    NeedOF = true;
+    break;
+  }
+
+  // See if we can use the EFLAGS value from the operand instead of
+  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+  // we prove that the arithmetic won't overflow, we can't use OF or CF.
+  if (Op.getResNo() != 0 || NeedOF || NeedCF)
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  unsigned Opcode = 0;
+  unsigned NumOperands = 0;
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic instruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+      default: break;
+      case ISD::ADD:
+      case ISD::SUB:
+      case ISD::AND:
+      case ISD::OR:
+      case ISD::XOR: {
+        NeedTruncation = true;
+        ArithOp = Arith;
+      }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+  // which may be the result of a CAST. We use the variable 'Op', which is the
+  // non-cast variable, when we check for possible users.
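+  // For example (illustrative): for (setcc (add %x, 1), 0, eq), the add is
+  // rewritten as X86ISD::INC below and the setcc consumes its EFLAGS result
+  // directly, so no separate TEST or CMP needs to be emitted.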
+  switch (ArithOp.getOpcode()) {
+  case ISD::ADD:
+    // Due to an isel shortcoming, be conservative if this add is likely to be
+    // selected as part of a load-modify-store instruction. When the root node
+    // in a match is a store, isel doesn't know how to remap non-chain non-flag
+    // uses of other nodes in the match, such as the ADD in this case. This
+    // leads to the ADD being left around and reselected, with the result being
+    // two adds in the output. Alas, even if none of our users are stores, that
+    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
+    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
+    // climbing the DAG back to the root, and it doesn't seem to be worth the
+    // effort.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+         UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() != ISD::CopyToReg &&
+          UI->getOpcode() != ISD::SETCC &&
+          UI->getOpcode() != ISD::STORE)
+        goto default_case;
+
+    if (ConstantSDNode *C =
+        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
+      // An add of one will be selected as an INC.
+      if (C->getAPIntValue() == 1) {
+        Opcode = X86ISD::INC;
+        NumOperands = 1;
+        break;
+      }
+
+      // An add of negative one (subtract of one) will be selected as a DEC.
+      if (C->getAPIntValue().isAllOnesValue()) {
+        Opcode = X86ISD::DEC;
+        NumOperands = 1;
+        break;
+      }
+    }
+
+    // Otherwise use a regular EFLAGS-setting add.
+    Opcode = X86ISD::ADD;
+    NumOperands = 2;
+    break;
+  case ISD::AND: {
+    // If the primary result of the AND isn't used, don't bother using
+    // X86ISD::AND, because a TEST instruction will be better.
+    bool NonFlagUse = false;
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+         UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+      unsigned UOpNo = UI.getOperandNo();
+      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+        // Look past the truncate.
+        UOpNo = User->use_begin().getOperandNo();
+        User = *User->use_begin();
+      }
+
+      if (User->getOpcode() != ISD::BRCOND &&
+          User->getOpcode() != ISD::SETCC &&
+          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
+        NonFlagUse = true;
+        break;
+      }
+    }
+
+    if (!NonFlagUse)
+      break;
+  }
+    // FALL THROUGH
+  case ISD::SUB:
+  case ISD::OR:
+  case ISD::XOR:
+    // Due to the ISEL shortcoming noted above, be conservative if this op is
+    // likely to be selected as part of a load-modify-store instruction.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+         UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() == ISD::STORE)
+        goto default_case;
+
+    // Otherwise use a regular EFLAGS-setting instruction.
+    switch (ArithOp.getOpcode()) {
+    default: llvm_unreachable("unexpected operator!");
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
+    case ISD::XOR: Opcode = X86ISD::XOR; break;
+    case ISD::AND: Opcode = X86ISD::AND; break;
+    case ISD::OR: {
+      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+        if (EFLAGS.getNode())
+          return EFLAGS;
+      }
+      Opcode = X86ISD::OR;
+      break;
+    }
+    }
+
+    NumOperands = 2;
+    break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return SDValue(Op.getNode(), 1);
+  default:
+  default_case:
+    break;
+  }
+
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
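+  // For example (editorial sketch): given a pattern like
+  //   (i8 (trunc (i32 and X, Y))) == 0
+  // the truncation is pushed onto the operands,
+  //   (i8 and (trunc X), (trunc Y)) == 0
+  // so the 8-bit AND both computes the value and sets the flags for the SETCC.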
+ if (NeedTruncation) { + EVT VT = Op.getValueType(); + SDValue WideVal = Op->getOperand(0); + EVT WideVT = WideVal.getValueType(); + unsigned ConvertedOp = 0; + // Use a target machine opcode to prevent further DAGCombine + // optimizations that may separate the arithmetic operations + // from the setcc node. + switch (WideVal.getOpcode()) { + default: break; + case ISD::ADD: ConvertedOp = X86ISD::ADD; break; + case ISD::SUB: ConvertedOp = X86ISD::SUB; break; + case ISD::AND: ConvertedOp = X86ISD::AND; break; + case ISD::OR: ConvertedOp = X86ISD::OR; break; + case ISD::XOR: ConvertedOp = X86ISD::XOR; break; + } + + if (ConvertedOp) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { + SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); + SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); + Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); + } + } + } + + if (Opcode == 0) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0; i != NumOperands; ++i) + Ops.push_back(Op.getOperand(i)); + + SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + DAG.ReplaceAllUsesWith(Op, New); + return SDValue(New.getNode(), 1); +} + +/// Emit nodes that will be selected as "cmp Op0,Op1", or something +/// equivalent. +SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SelectionDAG &DAG) const { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) + if (C->getAPIntValue() == 0) + return EmitTest(Op0, X86CC, DAG); + + DebugLoc dl = Op0.getDebugLoc(); + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || + Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { + // Use SUB instead of CMP to enable CSE between SUB and CMP. + SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, + Op0, Op1); + return SDValue(Sub.getNode(), 1); + } + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); +} + +/// Convert a comparison if required by the subtarget. +SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, + SelectionDAG &DAG) const { + // If the subtarget does not support the FUCOMI instruction, floating-point + // comparisons have to be converted. + if (Subtarget->hasCMov() || + Cmp.getOpcode() != X86ISD::CMP || + !Cmp.getOperand(0).getValueType().isFloatingPoint() || + !Cmp.getOperand(1).getValueType().isFloatingPoint()) + return Cmp; + + // The instruction selector will select an FUCOM instruction instead of + // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. 
Hence
+  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+  DebugLoc dl = Cmp.getDebugLoc();
+  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+                            DAG.getConstant(8, MVT::i8));
+  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
+static bool isAllOnes(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isAllOnesValue();
+}
+
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
+/// node if possible.
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+                                     DebugLoc dl, SelectionDAG &DAG) const {
+  SDValue Op0 = And.getOperand(0);
+  SDValue Op1 = And.getOperand(1);
+  if (Op0.getOpcode() == ISD::TRUNCATE)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::TRUNCATE)
+    Op1 = Op1.getOperand(0);
+
+  SDValue LHS, RHS;
+  if (Op1.getOpcode() == ISD::SHL)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
+      if (And00C->getZExtValue() == 1) {
+        // If we looked past a truncate, check that it's only truncating away
+        // known zeros.
+        unsigned BitWidth = Op0.getValueSizeInBits();
+        unsigned AndBitWidth = And.getValueSizeInBits();
+        if (BitWidth > AndBitWidth) {
+          APInt Zeros, Ones;
+          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+            return SDValue();
+        }
+        LHS = Op1;
+        RHS = Op0.getOperand(1);
+      }
+  } else if (Op1.getOpcode() == ISD::Constant) {
+    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+    uint64_t AndRHSVal = AndRHS->getZExtValue();
+    SDValue AndLHS = Op0;
+
+    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+      LHS = AndLHS.getOperand(0);
+      RHS = AndLHS.getOperand(1);
+    }
+
+    // Use BT if the immediate can't be encoded in a TEST instruction.
+    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+      LHS = AndLHS;
+      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
+    }
+  }
+
+  if (LHS.getNode()) {
+    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
+    // instruction. Since the shift amount is in-range-or-undefined, we know
+    // that doing a bittest on the i32 value is ok. We extend to i32 because
+    // the encoding for the i16 version is larger than the i32 version.
+    // Also promote i16 to i32 for performance / code size reasons.
+    if (LHS.getValueType() == MVT::i8 ||
+        LHS.getValueType() == MVT::i16)
+      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+    // If the operand types disagree, extend the shift amount to match. Since
+    // BT ignores high bits (like shifts) we can use anyextend.
+    if (LHS.getValueType() != RHS.getValueType())
+      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                       DAG.getConstant(Cond, MVT::i8), BT);
+  }
+
+  return SDValue();
+}
+
+// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
+// ones, and then concatenate the result back.
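+// As a rough intrinsics-level sketch of the split (hedged illustration only;
+// 'cmpgt256' is a made-up name, and this assumes AVX without AVX2, which is
+// exactly when this path fires):
+//   static __m256i cmpgt256(__m256i a, __m256i b) {
+//     __m128i lo = _mm_cmpgt_epi32(_mm256_castsi256_si128(a),
+//                                  _mm256_castsi256_si128(b));
+//     __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1),
+//                                  _mm256_extractf128_si256(b, 1));
+//     return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+//   }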
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); + + assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + DebugLoc dl = Op.getDebugLoc(); + SDValue CC = Op.getOperand(2); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + + // Issue the operation on the smaller types and concatenate the result back + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); +} + +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond; + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getValueType().getSimpleVT(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); + DebugLoc dl = Op.getDebugLoc(); + + if (isFP) { +#ifndef NDEBUG + MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT(); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); +#endif + + unsigned SSECC; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; + } + if (Swap) + std::swap(Op0, Op1); + + // In the two special cases we can't handle, emit two comparisons. + if (SSECC == 8) { + unsigned CC0, CC1; + unsigned CombineOpc; + if (SetCCOpcode == ISD::SETUEQ) { + CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; + } else { + assert(SetCCOpcode == ISD::SETONE); + CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; + } + + SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(CC0, MVT::i8)); + SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(CC1, MVT::i8)); + return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); + } + // Handle all other FP comparisons here. + return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); + } + + // Break 256-bit integer vector compare into smaller ones. + if (VT.is256BitVector() && !Subtarget->hasInt256()) + return Lower256IntVSETCC(Op, DAG); + + // We are handling one of the integer comparisons here. 
Since SSE only has + // GT and EQ comparisons for integer, swapping operands and multiple + // operations may be required for some comparisons. + unsigned Opc; + bool Swap = false, Invert = false, FlipSigns = false; + + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; + case ISD::SETGE: Swap = true; + case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; + case ISD::SETUGE: Swap = true; + case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; + } + if (Swap) + std::swap(Op0, Op1); + + // Check that the operation in question is available (most are plain SSE2, + // but PCMPGTQ and PCMPEQQ have different requirements). + if (VT == MVT::v2i64) { + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { + assert(Subtarget->hasSSE2() && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); + Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); + + // Create masks for only the low parts/high parts of the 64 bit integers. + const int MaskHi[] = { 1, 1, 3, 3 }; + const int MaskLo[] = { 0, 0, 2, 2 }; + SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); + SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); + SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); + Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { + // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with + // pcmpeqd + pshufd + pand. + assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); + Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + + // Do the compare. + SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); + + // Make sure the lower and upper halves are both all-ones. 
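+      // Scalar model of the two emulations above (editorial sketch; the
+      // helper name is invented). A 64-bit signed greater-than built from
+      // 32-bit pieces, with the low half compared unsigned:
+      //   bool sgt64(uint64_t a, uint64_t b) {
+      //     int32_t  ahi = (int32_t)(a >> 32), bhi = (int32_t)(b >> 32);
+      //     uint32_t alo = (uint32_t)a,        blo = (uint32_t)b;
+      //     return ahi > bhi || (ahi == bhi && alo > blo);
+      //   }
+      // Equality likewise needs both 32-bit halves to match, which the
+      // pcmpeqd + pshufd + pand sequence below checks lane by lane.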
+ const int Mask[] = { 1, 0, 3, 2 }; + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); + Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. + if (FlipSigns) { + EVT EltVT = VT.getVectorElementType(); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + } + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + + // If the logical-not of the result is required, perform that now. + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + return Result; +} + +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getValueType().getSimpleVT(); + + if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + Op1.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(Op1)->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); + if (NewSetCC.getNode()) + return NewSetCC; + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if (Op1.getOpcode() == ISD::Constant && + (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + cast<ConstantSDNode>(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. + if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast<ConstantSDNode>(Op1)->isNullValue(); + if (!Invert) return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + } + + bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAGS); +} + +// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
+static bool isX86LogicalCmp(SDValue Op) { + unsigned Opc = Op.getNode()->getOpcode(); + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || + Opc == X86ISD::SAHF) + return true; + if (Op.getResNo() == 1 && + (Opc == X86ISD::ADD || + Opc == X86ISD::SUB || + Opc == X86ISD::ADC || + Opc == X86ISD::SBB || + Opc == X86ISD::SMUL || + Opc == X86ISD::UMUL || + Opc == X86ISD::INC || + Opc == X86ISD::DEC || + Opc == X86ISD::OR || + Opc == X86ISD::XOR || + Opc == X86ISD::AND)) + return true; + + if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) + return true; + + return false; +} + +static bool isZero(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isNullValue(); +} + +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { + if (V.getOpcode() != ISD::TRUNCATE) + return false; + + SDValue VOp0 = V.getOperand(0); + unsigned InBits = VOp0.getValueSizeInBits(); + unsigned Bits = V.getValueSizeInBits(); + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); +} + +SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Cond = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + SDValue CC; + + if (Cond.getOpcode() == ISD::SETCC) { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y + // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y + // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y + // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + if (Cond.getOpcode() == X86ISD::SETCC && + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + isZero(Cond.getOperand(1).getOperand(1))) { + SDValue Cmp = Cond.getOperand(1); + + unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + + if ((isAllOnes(Op1) || isAllOnes(Op2)) && + (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { + SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + + SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases + // (select (x != 0), -1, 0) -> neg & sbb + // (select (x == 0), 0, -1) -> neg & sbb + if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) + if (YC->isNullValue() && + (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, + DAG.getConstant(0, CmpOp0.getValueType()), + CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } + + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + + SDValue Res = // Res = 0 or -1. + DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), Cmp); + + if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + Res = DAG.getNOT(DL, Res, Res.getValueType()); + + ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); + if (N2C == 0 || !N2C->isNullValue()) + Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); + return Res; + } + } + + // Look past (and (setcc_carry (cmp ...)), 1). 
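+  // Scalar model of the neg/sbb idiom above (editorial sketch; the function
+  // name is invented): "neg x" sets CF iff x != 0, and "sbb r, r" broadcasts
+  // CF into a full register:
+  //   uint32_t mask_if_nonzero(uint32_t x) {
+  //     uint32_t cf = (x != 0);   // CF after "negl %edi"
+  //     return 0u - cf;           // "sbbl %eax, %eax": 0 or 0xFFFFFFFF
+  //   }
+  // so (select (x != 0), -1, y) becomes mask_if_nonzero(x) | y.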
+ if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (C && C->getAPIntValue() == 1) + Cond = Cond.getOperand(0); + } + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + MVT VT = Op.getValueType().getSimpleVT(); + + bool IllegalFPCMov = false; + if (VT.isFloatingPoint() && !VT.isVector() && + !isScalarFPTypeInSSEReg(VT)) // FPStack? + IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); + + if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || + Opc == X86ISD::BT) { // FIXME + Cond = Cmp; + addTest = false; + } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; + } + + if (addTest) { + // Look pass the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); + if (NewSetCC.getNode()) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DAG); + } + + // a < b ? -1 : 0 -> RES = ~setcc_carry + // a < b ? 0 : -1 -> RES = setcc_carry + // a >= b ? -1 : 0 -> RES = setcc_carry + // a >= b ? 
0 : -1 -> RES = ~setcc_carry + if (Cond.getOpcode() == X86ISD::SUB) { + Cond = ConvertCmpIfNecessary(Cond, DAG); + unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + + if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && + (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), Cond); + if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + return DAG.getNOT(DL, Res, Res.getValueType()); + return Res; + } + } + + // X86 doesn't have an i8 cmov. If both operands are the result of a truncate + // widen the cmov and push the truncate through. This avoids introducing a new + // branch during isel and doesn't add any extensions. + if (Op.getValueType() == MVT::i8 && + Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + // Blacklist CopyFromReg to avoid partial register stalls. + T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ + SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + } + + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if + // condition is true. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); + SDValue Ops[] = { Op2, Op1, CC, Cond }; + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); +} + +SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + MVT VT = Op->getValueType(0).getSimpleVT(); + SDValue In = Op->getOperand(0); + MVT InVT = In.getValueType().getSimpleVT(); + DebugLoc dl = Op->getDebugLoc(); + + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && + (VT != MVT::v8i32 || InVT != MVT::v8i16)) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + unsigned NumElems = InVT.getVectorNumElements(); + SDValue Undef = DAG.getUNDEF(InVT); + + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); + + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + +// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or +// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart +// from the AND / OR. 
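+// Intrinsics-level sketch of the sign-extension split above (hedged
+// illustration; 'sext16to32' is a made-up name, assuming SSE4.1 pmovsx):
+//   static __m256i sext16to32(__m128i v) {                  // v8i16 -> v8i32
+//     __m128i lo = _mm_cvtepi16_epi32(v);                    // elements 0..3
+//     __m128i hi = _mm_cvtepi16_epi32(_mm_srli_si128(v, 8)); // elements 4..7
+//     return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+//   }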
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { + Opc = Op.getOpcode(); + if (Opc != ISD::OR && Opc != ISD::AND) + return false; + return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse() && + Op.getOperand(1).getOpcode() == X86ISD::SETCC && + Op.getOperand(1).hasOneUse()); +} + +// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and +// 1 and that the SETCC node has a single use. +static bool isXor1OfSetCC(SDValue Op) { + if (Op.getOpcode() != ISD::XOR) + return false; + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (N1C && N1C->getAPIntValue() == 1) { + return Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse(); + } + return false; +} + +SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + SDValue CC; + bool Inverted = false; + + if (Cond.getOpcode() == ISD::SETCC) { + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isa<ConstantSDNode>(Cond.getOperand(1)) && + cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + } +#if 0 + // FIXME: LowerXALUO doesn't handle these!! + else if (Cond.getOpcode() == X86ISD::ADD || + Cond.getOpcode() == X86ISD::SUB || + Cond.getOpcode() == X86ISD::SMUL || + Cond.getOpcode() == X86ISD::UMUL) + Cond = LowerXALUO(Cond, DAG); +#endif + + // Look pass (and (setcc_carry (cmp ...)), 1). + if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (C && C->getAPIntValue() == 1) + Cond = Cond.getOperand(0); + } + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? + if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { + Cond = Cmp; + addTest = false; + } else { + switch (cast<ConstantSDNode>(CC)->getZExtValue()) { + default: break; + case X86::COND_O: + case X86::COND_B: + // These can only come from an arithmetic instruction with overflow, + // e.g. SADDO, UADDO. 
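+      // For instance (editorial sketch), checked addition in source form:
+      //   int r;
+      //   if (__builtin_add_overflow(a, b, &r))  // becomes ISD::SADDO
+      //     ...
+      // lowers to "addl %esi, %edi; jo ...": the add sets OF and the branch
+      // consumes it directly, with no separate compare.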
+ Cond = Cond.getNode()->getOperand(1); + addTest = false; + break; + } + } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; + } else { + unsigned CondOpc; + if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { + SDValue Cmp = Cond.getOperand(0).getOperand(1); + if (CondOpc == ISD::OR) { + // Also, recognize the pattern generated by an FCMP_UNE. We can emit + // two branches instead of an explicit OR instruction with a + // separate test. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp)) { + CC = Cond.getOperand(0).getOperand(0); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = Cond.getOperand(1).getOperand(0); + Cond = Cmp; + addTest = false; + } + } else { // ISD::AND + // Also, recognize the pattern generated by an FCMP_OEQ. We can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp) && + Op.getNode()->hasOneUse()) { + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } + } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { + // Recognize for xorb (setcc), 1 patterns. 
The xor inverts the condition. + // It should be transformed during dag combiner except when the condition + // is set by a arithmetics with overflow node. + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + Cond = Cond.getOperand(0).getOperand(1); + addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } + } + } + + if (addTest) { + // Look pass the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. 
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+  }
+  Cond = ConvertCmpIfNecessary(Cond, DAG);
+  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                     Chain, Dest, CC, Cond);
+}
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+          getTargetMachine().Options.EnableSegmentedStacks) &&
+         "This should be used only on Windows targets or when segmented stacks "
+         "are being used");
+  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  // FIXME: Ensure alignment here
+
+  bool Is64Bit = Subtarget->is64Bit();
+  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+
+  if (getTargetMachine().Options.EnableSegmentedStacks) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    if (Is64Bit) {
+      // The 64-bit implementation of segmented stacks needs to clobber both
+      // r10 and r11. This makes it impossible to use it along with nested
+      // parameters.
+      const Function *F = MF.getFunction();
+
+      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+           I != E; ++I)
+        if (I->hasNestAttr())
+          report_fatal_error("Cannot use segmented stacks with functions that "
+                             "have nested arguments.");
+    }
+
+    const TargetRegisterClass *AddrRegClass =
+      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+                                DAG.getRegister(Vreg, SPTy));
+    SDValue Ops1[2] = { Value, Chain };
+    return DAG.getMergeValues(Ops1, 2, dl);
+  } else {
+    SDValue Flag;
+    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+    Flag = Chain.getValue(1);
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+    Flag = Chain.getValue(1);
+
+    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                               SPTy).getValue(1);
+
+    SDValue Ops1[2] = { Chain.getValue(0), Chain };
+    return DAG.getMergeValues(Ops1, 2, dl);
+  }
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  DebugLoc DL = Op.getDebugLoc();
+
+  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
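+    // On 64-bit SysV targets (handled after this branch), vastart instead
+    // fills in what is, in C terms, this struct (standard AMD64 ABI layout,
+    // shown for reference only):
+    //   typedef struct {
+    //     unsigned gp_offset;        // +0:  next GP register slot, 0..48
+    //     unsigned fp_offset;        // +4:  next XMM slot, 48..176
+    //     void *overflow_arg_area;   // +8:  arguments passed on the stack
+    //     void *reg_save_area;       // +16: spilled argument registers
+    //   } __va_list_tag;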
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + getPointerTy()); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); + } + + // __va_list_tag: + // gp_offset (0 - 6 * 8) + // fp_offset (48 - 48 + 8 * 16) + // overflow_arg_area (point to parameters coming in memory). + // reg_save_area + SmallVector<SDValue, 8> MemOps; + SDValue FIN = Op.getOperand(1); + // Store gp_offset + SDValue Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsGPOffset(), + MVT::i32), + FIN, MachinePointerInfo(SV), false, false, 0); + MemOps.push_back(Store); + + // Store fp_offset + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), + FIN, DAG.getIntPtrConstant(4)); + Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsFPOffset(), + MVT::i32), + FIN, MachinePointerInfo(SV, 4), false, false, 0); + MemOps.push_back(Store); + + // Store ptr to overflow_arg_area + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), + FIN, DAG.getIntPtrConstant(4)); + SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + getPointerTy()); + Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, + MachinePointerInfo(SV, 8), + false, false, 0); + MemOps.push_back(Store); + + // Store ptr to reg_save_area. + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), + FIN, DAG.getIntPtrConstant(8)); + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy()); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, + MachinePointerInfo(SV, 16), false, false, 0); + MemOps.push_back(Store); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + &MemOps[0], MemOps.size()); +} + +SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->is64Bit() && + "LowerVAARG only handles 64-bit va_arg!"); + assert((Subtarget->isTargetLinux() || + Subtarget->isTargetDarwin()) && + "Unhandled target in LowerVAARG"); + assert(Op.getNode()->getNumOperands() == 4); + SDValue Chain = Op.getOperand(0); + SDValue SrcPtr = Op.getOperand(1); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + unsigned Align = Op.getConstantOperandVal(3); + DebugLoc dl = Op.getDebugLoc(); + + EVT ArgVT = Op.getNode()->getValueType(0); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + uint8_t ArgMode; + + // Decide which area this value should be read from. + // TODO: Implement the AMD64 ABI in its entirety. This simple + // selection mechanism works only for the basic types. + if (ArgVT == MVT::f80) { + llvm_unreachable("va_arg for f80 not yet implemented"); + } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + ArgMode = 2; // Argument passed in XMM register. Use fp_offset. + } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. + } else { + llvm_unreachable("Unhandled argument type in LowerVAARG"); + } + + if (ArgMode == 2) { + // Sanity Check: Make sure using fp_offset makes sense. 
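+    // This mirrors the SysV AMD64 ABI classification: INTEGER-class values
+    // are fetched through gp_offset, SSE-class values through fp_offset, and
+    // anything else through overflow_arg_area. For example, va_arg(ap, double)
+    // takes the ArgMode == 2 path here.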
+ assert(!getTargetMachine().Options.UseSoftFloat && + !(DAG.getMachineFunction() + .getFunction()->getAttributes() + .hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) && + Subtarget->hasSSE1()); + } + + // Insert VAARG_64 node into the DAG + // VAARG_64 returns two values: Variable Argument Address, Chain + SmallVector<SDValue, 11> InstOps; + InstOps.push_back(Chain); + InstOps.push_back(SrcPtr); + InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); + InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); + InstOps.push_back(DAG.getConstant(Align, MVT::i32)); + SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); + SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, + VTs, &InstOps[0], InstOps.size(), + MVT::i64, + MachinePointerInfo(SV), + /*Align=*/0, + /*Volatile=*/false, + /*ReadMem=*/true, + /*WriteMem=*/true); + Chain = VAARG.getValue(1); + + // Load the next argument and return it + return DAG.getLoad(ArgVT, dl, + Chain, + VAARG, + MachinePointerInfo(), + false, false, false, 0); +} + +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + SDValue Chain = Op.getOperand(0); + SDValue DstPtr = Op.getOperand(1); + SDValue SrcPtr = Op.getOperand(2); + const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + DebugLoc DL = Op.getDebugLoc(); + + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, + DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, + false, + MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); +} + +// getTargetVShiftNode - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + + if (isa<ConstantSDNode>(ShAmt)) { + // Constant may be a TargetConstant. Use a regular constant. + uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + // Need to build a vector containing shift amount + // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 + SDValue ShOps[4]; + ShOps[0] = ShAmt; + ShOps[1] = DAG.getConstant(0, MVT::i32); + ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + + // The return type has to be a 128-bit type with the same element + // type as the input type. 
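+  // Intrinsics-level view of the same trick (hedged sketch): the register
+  // form of the SSE shifts reads a 64-bit count from an XMM register, so a
+  // scalar i32 amount is first placed in the low element, rest zeroed:
+  //   __m128i amt = _mm_cvtsi32_si128(n);   // {n, 0, 0, 0}
+  //   __m128i r   = _mm_sll_epi32(v, amt);  // pslld %xmm, %xmm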
+ MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + + ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + // Comparison intrinsics. + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomineq_sd: { + unsigned Opc; + ISD::CondCode CC; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse2_comieq_sd: + Opc = X86ISD::COMI; + CC = ISD::SETEQ; + break; + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse2_comilt_sd: + Opc = X86ISD::COMI; + CC = ISD::SETLT; + break; + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse2_comile_sd: + Opc = X86ISD::COMI; + CC = ISD::SETLE; + break; + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse2_comigt_sd: + Opc = X86ISD::COMI; + CC = ISD::SETGT; + break; + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse2_comige_sd: + Opc = X86ISD::COMI; + CC = ISD::SETGE; + break; + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse2_comineq_sd: + Opc = X86ISD::COMI; + CC = ISD::SETNE; + break; + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse2_ucomieq_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETEQ; + break; + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse2_ucomilt_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETLT; + break; + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse2_ucomile_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETLE; + break; + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse2_ucomigt_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETGT; + break; + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse2_ucomige_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETGE; + break; + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_ucomineq_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETNE; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + // Arithmetic 
intrinsics. + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + // SSE2/AVX2 sub with unsigned saturation intrinsics + case Intrinsic::x86_sse2_psubus_b: + case Intrinsic::x86_sse2_psubus_w: + case Intrinsic::x86_avx2_psubus_b: + case Intrinsic::x86_avx2_psubus_w: + return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + // SSE3/AVX horizontal add/sub intrinsics + case Intrinsic::x86_sse3_hadd_ps: + case Intrinsic::x86_sse3_hadd_pd: + case Intrinsic::x86_avx_hadd_ps_256: + case Intrinsic::x86_avx_hadd_pd_256: + case Intrinsic::x86_sse3_hsub_ps: + case Intrinsic::x86_sse3_hsub_pd: + case Intrinsic::x86_avx_hsub_ps_256: + case Intrinsic::x86_avx_hsub_pd_256: + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse3_hadd_ps: + case Intrinsic::x86_sse3_hadd_pd: + case Intrinsic::x86_avx_hadd_ps_256: + case Intrinsic::x86_avx_hadd_pd_256: + Opcode = X86ISD::FHADD; + break; + case Intrinsic::x86_sse3_hsub_ps: + case Intrinsic::x86_sse3_hsub_pd: + case Intrinsic::x86_avx_hsub_ps_256: + case Intrinsic::x86_avx_hsub_pd_256: + Opcode = X86ISD::FHSUB; + break; + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + Opcode = X86ISD::HADD; + break; + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + Opcode = X86ISD::HSUB; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + + // SSE2/SSE41/AVX2 integer max/min intrinsics. + case Intrinsic::x86_sse2_pmaxu_b: + case Intrinsic::x86_sse41_pmaxuw: + case Intrinsic::x86_sse41_pmaxud: + case Intrinsic::x86_avx2_pmaxu_b: + case Intrinsic::x86_avx2_pmaxu_w: + case Intrinsic::x86_avx2_pmaxu_d: + case Intrinsic::x86_sse2_pminu_b: + case Intrinsic::x86_sse41_pminuw: + case Intrinsic::x86_sse41_pminud: + case Intrinsic::x86_avx2_pminu_b: + case Intrinsic::x86_avx2_pminu_w: + case Intrinsic::x86_avx2_pminu_d: + case Intrinsic::x86_sse41_pmaxsb: + case Intrinsic::x86_sse2_pmaxs_w: + case Intrinsic::x86_sse41_pmaxsd: + case Intrinsic::x86_avx2_pmaxs_b: + case Intrinsic::x86_avx2_pmaxs_w: + case Intrinsic::x86_avx2_pmaxs_d: + case Intrinsic::x86_sse41_pminsb: + case Intrinsic::x86_sse2_pmins_w: + case Intrinsic::x86_sse41_pminsd: + case Intrinsic::x86_avx2_pmins_b: + case Intrinsic::x86_avx2_pmins_w: + case Intrinsic::x86_avx2_pmins_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse2_pmaxu_b: + case Intrinsic::x86_sse41_pmaxuw: + case Intrinsic::x86_sse41_pmaxud: + case Intrinsic::x86_avx2_pmaxu_b: + case Intrinsic::x86_avx2_pmaxu_w: + case Intrinsic::x86_avx2_pmaxu_d: + Opcode = X86ISD::UMAX; + break; + case Intrinsic::x86_sse2_pminu_b: + case Intrinsic::x86_sse41_pminuw: + case Intrinsic::x86_sse41_pminud: + case Intrinsic::x86_avx2_pminu_b: + case Intrinsic::x86_avx2_pminu_w: + case Intrinsic::x86_avx2_pminu_d: + Opcode = X86ISD::UMIN; + break; + case Intrinsic::x86_sse41_pmaxsb: + case Intrinsic::x86_sse2_pmaxs_w: + case Intrinsic::x86_sse41_pmaxsd: + case Intrinsic::x86_avx2_pmaxs_b: + case Intrinsic::x86_avx2_pmaxs_w: + case Intrinsic::x86_avx2_pmaxs_d: + Opcode = X86ISD::SMAX; + break; + case Intrinsic::x86_sse41_pminsb: + case Intrinsic::x86_sse2_pmins_w: + case Intrinsic::x86_sse41_pminsd: + case Intrinsic::x86_avx2_pmins_b: + case Intrinsic::x86_avx2_pmins_w: + case Intrinsic::x86_avx2_pmins_d: + Opcode = X86ISD::SMIN; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + + // SSE/SSE2/AVX floating point max/min intrinsics. + case Intrinsic::x86_sse_max_ps: + case Intrinsic::x86_sse2_max_pd: + case Intrinsic::x86_avx_max_ps_256: + case Intrinsic::x86_avx_max_pd_256: + case Intrinsic::x86_sse_min_ps: + case Intrinsic::x86_sse2_min_pd: + case Intrinsic::x86_avx_min_ps_256: + case Intrinsic::x86_avx_min_pd_256: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse_max_ps: + case Intrinsic::x86_sse2_max_pd: + case Intrinsic::x86_avx_max_ps_256: + case Intrinsic::x86_avx_max_pd_256: + Opcode = X86ISD::FMAX; + break; + case Intrinsic::x86_sse_min_ps: + case Intrinsic::x86_sse2_min_pd: + case Intrinsic::x86_avx_min_ps_256: + case Intrinsic::x86_avx_min_pd_256: + Opcode = X86ISD::FMIN; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + + // AVX2 variable shift intrinsics + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + Opcode = ISD::SHL; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + Opcode = ISD::SRL; + break; + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + Opcode = ISD::SRA; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_ssse3_psign_b_128: + case Intrinsic::x86_ssse3_psign_w_128: + case Intrinsic::x86_ssse3_psign_d_128: + case Intrinsic::x86_avx2_psign_b: + case Intrinsic::x86_avx2_psign_w: + case Intrinsic::x86_avx2_psign_d: + return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse41_insertps: + return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + case Intrinsic::x86_avx2_vperm2i128: + return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/instruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); + + case Intrinsic::x86_sse_sqrt_ps: + case Intrinsic::x86_sse2_sqrt_pd: + case Intrinsic::x86_avx_sqrt_ps_256: + case Intrinsic::x86_avx_sqrt_pd_256: + return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1)); + + // ptest and testp intrinsics. The intrinsics these come from are designed to + // return an integer value, not just an instruction, so lower it to the ptest + // or testp pattern and a setcc for the result.
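+ // For reference: ptest with operands (LHS, RHS) sets ZF when (LHS & RHS) is + // all zeros and CF when (~LHS & RHS) is all zeros (testp checks the same on + // just the sign bits), which is why the z/c/nzc variants map onto the E/B/A + // condition codes below.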
+ case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestz_256: + case Intrinsic::x86_avx_ptestc_256: + case Intrinsic::x86_avx_ptestnzc_256: + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + case Intrinsic::x86_avx_vtestc_pd_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: { + bool IsTestPacked = false; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_avx_ptestz_256: + // ZF = 1 + X86CC = X86::COND_E; + break; + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_avx_ptestc_256: + // CF = 1 + X86CC = X86::COND_B; + break; + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestnzc_256: + // ZF and CF = 0 + X86CC = X86::COND_A; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; + SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); + SDValue CC = DAG.getConstant(X86CC, MVT::i8); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + // SSE/AVX shift intrinsics + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + Opcode = X86ISD::VSHL; + break; + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + Opcode = X86ISD::VSRL; + break; + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + Opcode = X86ISD::VSRA; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + + // SSE/AVX immediate shift intrinsics + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + Opcode = X86ISD::VSHLI; + break; + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + Opcode = X86ISD::VSRLI; + break; + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + Opcode = X86ISD::VSRAI; + break; + } + return getTargetVShiftNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + } + + case Intrinsic::x86_sse42_pcmpistria128: + case Intrinsic::x86_sse42_pcmpestria128: + case Intrinsic::x86_sse42_pcmpistric128: + case Intrinsic::x86_sse42_pcmpestric128: + case Intrinsic::x86_sse42_pcmpistrio128: + case Intrinsic::x86_sse42_pcmpestrio128: + case Intrinsic::x86_sse42_pcmpistris128: + case Intrinsic::x86_sse42_pcmpestris128: + case Intrinsic::x86_sse42_pcmpistriz128: + case Intrinsic::x86_sse42_pcmpestriz128: { + unsigned Opcode; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse42_pcmpistria128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpestria128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpistric128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpestric128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpistrio128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpestrio128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpistris128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpestris128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpistriz128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_E; + break; + case Intrinsic::x86_sse42_pcmpestriz128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_E; + break; + } + SmallVector<SDValue, 5> NewOps; + NewOps.append(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), + SDValue(PCMP.getNode(), 1)); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + case Intrinsic::x86_sse42_pcmpistri128: + case Intrinsic::x86_sse42_pcmpestri128: { + unsigned Opcode; + if (IntNo == Intrinsic::x86_sse42_pcmpistri128) + Opcode = X86ISD::PCMPISTRI; + else + Opcode = X86ISD::PCMPESTRI; + + SmallVector<SDValue, 5> NewOps; + NewOps.append(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + } + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + Opc = X86ISD::FMADD; + break; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + Opc = X86ISD::FMSUB; + break; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + Opc = X86ISD::FNMADD; + break; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + Opc = X86ISD::FNMSUB; + break; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + Opc = X86ISD::FMADDSUB; + break; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + Opc = X86ISD::FMSUBADD; + break; + } + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } + } +} + +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + + // RDRAND/RDSEED intrinsics. + case Intrinsic::x86_rdrand_16: + case Intrinsic::x86_rdrand_32: + case Intrinsic::x86_rdrand_64: + case Intrinsic::x86_rdseed_16: + case Intrinsic::x86_rdseed_32: + case Intrinsic::x86_rdseed_64: { + unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || + IntNo == Intrinsic::x86_rdseed_32 || + IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : + X86ISD::RDRAND; + // Emit the node with the right value type. + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); + + // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. + // Otherwise return the value from Rand, which is always 0, cast to i32. + SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, MVT::i32), + SDValue(Result.getNode(), 1) }; + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, + DAG.getVTList(Op->getValueType(1), MVT::Glue), + Ops, array_lengthof(Ops)); + + // Return { result, isValid, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, + SDValue(Result.getNode(), 2)); + } + + // XTEST intrinsics.
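+ // XTEST sets ZF when the processor is not in a transactional region, so the + // COND_NE setcc below yields 1 inside an RTM/HLE transaction and 0 otherwise.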
+ case Intrinsic::x86_xtest: { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NE, MVT::i8), + InTrans); + SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), + Ret, SDValue(InTrans.getNode(), 1)); + } + } +} + +SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + DebugLoc dl = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); + + if (Depth > 0) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = + DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, + FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Just load the return address. + SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + RetAddrFI, MachinePointerInfo(), false, false, false, 0); +} + +SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, false, 0); + return FrameAddr; +} + +SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, + SelectionDAG &DAG) const { + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); +} + +SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Offset = Op.getOperand(1); + SDValue Handler = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + EVT PtrVT = getPointerTy(); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || + (FrameReg == X86::EBP && PtrVT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); + unsigned StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize())); + StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), + false, false, 0); + Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); + + return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, + DAG.getRegister(StoreAddrReg, PtrVT)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, + DAG.getVTList(MVT::i32, MVT::Other), + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { + return Op.getOperand(0); +} + +SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + SDValue Root = Op.getOperand(0); + SDValue Trmp = Op.getOperand(1); // trampoline + SDValue FPtr = Op.getOperand(2); // nested function + SDValue Nest = Op.getOperand(3); // 'nest' parameter value + DebugLoc dl = Op.getDebugLoc(); + + const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + + if (Subtarget->is64Bit()) { + SDValue OutChains[6]; + + // Large code-model. + const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. + const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. + + const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; + const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; + + const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix + + // Load the pointer to the nested function into R11. + unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 + SDValue Addr = Trmp; + OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(2, MVT::i64)); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), + false, false, 2); + + // Load the 'nest' parameter value into R10. + // R10 is specified in X86CallingConv.td + OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(10, MVT::i64)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 10), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, MVT::i64)); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), + false, false, 2); + + // Jump to the nested function. + OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
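+ // The i16 store below puts the REX.WB prefix at offset 20 and the 0xFF + // opcode at offset 21; the ModRM byte stored after it (mod=11, reg=/4, + // rm=r11) completes the jmpq *%r11.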
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(20, MVT::i64)); + OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 20), + false, false, 0); + + unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(22, MVT::i64)); + OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, + MachinePointerInfo(TrmpAddr, 22), + false, false, 0); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); + } else { + const Function *Func = + cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); + CallingConv::ID CC = Func->getCallingConv(); + unsigned NestReg; + + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::C: + case CallingConv::X86_StdCall: { + // Pass 'nest' parameter in ECX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::ECX; + + // Check that ECX wasn't needed by an 'inreg' parameter. + FunctionType *FTy = Func->getFunctionType(); + const AttributeSet &Attrs = Func->getAttributes(); + + if (!Attrs.isEmpty() && !Func->isVarArg()) { + unsigned InRegCount = 0; + unsigned Idx = 1; + + for (FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); I != E; ++I, ++Idx) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) + // FIXME: should only count parameters that are lowered to integers. + InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + + if (InRegCount > 2) { + report_fatal_error("Nest register in use - reduce number of inreg" + " parameters!"); + } + } + break; + } + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::Fast: + // Pass 'nest' parameter in EAX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::EAX; + break; + } + + SDValue OutChains[4]; + SDValue Addr, Disp; + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(10, MVT::i32)); + Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); + + // This is storing the opcode for MOV32ri. + const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. + const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; + OutChains[0] = DAG.getStore(Root, dl, + DAG.getConstant(MOV32ri|N86Reg, MVT::i8), + Trmp, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(1, MVT::i32)); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), + false, false, 1); + + const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
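+ // 32-bit trampoline layout: B8+r imm32 (mov $nest into NestReg) in bytes + // 0-4, then E9 rel32 (jmp FPtr) in bytes 5-9; Disp above was computed + // relative to the end of the 10-byte stub.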
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(5, MVT::i32)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, + MachinePointerInfo(TrmpAddr, 5), + false, false, 1); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(6, MVT::i32)); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), + false, false, 1); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); + } +} + +SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + /* + The rounding mode is in bits 11:10 of FPSR, and has the following + settings: + 00 Round to nearest + 01 Round to -inf + 10 Round to +inf + 11 Round to 0 + + FLT_ROUNDS, on the other hand, expects the following: + -1 Undefined + 0 Round to 0 + 1 Round to nearest + 2 Round to +inf + 3 Round to -inf + + To perform the conversion, we do: + (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) + */ + + MachineFunction &MF = DAG.getMachineFunction(); + const TargetMachine &TM = MF.getTarget(); + const TargetFrameLowering &TFI = *TM.getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + + // Save FP Control Word to stack slot + int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, 2, 2); + + SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), + Ops, array_lengthof(Ops), MVT::i16, + MMO); + + // Load FP Control Word from stack slot + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, + MachinePointerInfo(), false, false, false, 0); + + // Transform as necessary + SDValue CWD1 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x800, MVT::i16)), + DAG.getConstant(11, MVT::i8)); + SDValue CWD2 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x400, MVT::i16)), + DAG.getConstant(9, MVT::i8)); + + SDValue RetVal = + DAG.getNode(ISD::AND, DL, MVT::i16, + DAG.getNode(ISD::ADD, DL, MVT::i16, + DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), + DAG.getConstant(1, MVT::i16)), + DAG.getConstant(3, MVT::i16)); + + return DAG.getNode((VT.getSizeInBits() < 16 ? + ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); +} + +static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // If src is zero (i.e. bsr sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits+NumBits-1, OpVT), + DAG.getConstant(X86::COND_E, MVT::i8), + Op.getValue(1) + }; + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + + // Finally xor with NumBits-1. 
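+ // bsr returns the index of the highest set bit, and for an N-bit value + // ctlz == (N-1) - index == index ^ (N-1). E.g. bsr(0x00010000) == 16, and + // 16 ^ 31 == 15, the number of leading zeros in that 32-bit value.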
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse). + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + Op = Op.getOperand(0); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); + + // If src is zero (i.e. bsf sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits, VT), + DAG.getConstant(X86::COND_E, MVT::i8), + Op.getValue(1) + }; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); +} + +// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit +// ones, and then concatenate the result back. +static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + assert(VT.is256BitVector() && VT.isInteger() && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + DebugLoc dl = Op.getDebugLoc(); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); +} + +static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { + assert(Op.getValueType().is256BitVector() && + Op.getValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { + assert(Op.getValueType().is256BitVector() && + Op.getValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + // Decompose 256-bit ops into smaller 128-bit ops. + if (VT.is256BitVector() && !Subtarget->hasInt256()) + return Lower256IntArith(Op, DAG); + + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 
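+ // pmuludq only multiplies the even lanes (0 and 2) into 64-bit products, so + // the odd lanes are first shuffled down into even positions, both pairs of + // products are formed, and the low 32 bits of each product are interleaved + // back together by the { 0, 4, 2, 6 } shuffle below.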
+ if (VT == MVT::v4i32) { + assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && + "Should not custom lower when pmuldq is available!"); + + // Extract the odd parts. + const int UnpackMask[] = { 1, -1, 3, -1 }; + SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); + SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); + + // Multiply the even parts. + SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); + // Now multiply odd parts. + SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); + + Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); + Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); + + // Merge the two vectors back together with a shuffle. This expands into 2 + // shuffles. + const int ShufMask[] = { 0, 4, 2, 6 }; + return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); + } + + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "Only know how to lower V2I64/V4I64 multiply"); + + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); + // + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); + // return AloBlo + AloBhi + AhiBlo; + + SDValue ShAmt = DAG.getConstant(32, MVT::i32); + + SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); + SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); + + // Bit cast to 32-bit vectors for MULUDQ + EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; + A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); + B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); + Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); + Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); + + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + + AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); + AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); + + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); +} + +SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT EltTy = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + + // Lower sdiv X, pow2-const. + BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); + if (!C) + return SDValue(); + + APInt SplatValue, SplatUndef; + unsigned MinSplatBits; + bool HasAnyUndefs; + if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + return SDValue(); + + if ((SplatValue != 0) && + (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { + unsigned lg2 = SplatValue.countTrailingZeros(); + // Splat the sign bit. + SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); + SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); + // Add (N0 < 0) ? abs2 - 1 : 0; + SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); + SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); + SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); + SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); + SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. 
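+ // This is the usual sdiv-by-2^k rewrite. E.g. with 32-bit elements and + // X/4 (lg2 == 2): X + ((X >>s 31) >>u 30) adds 3 only when X is negative, + // and the sum is then shifted right arithmetically by 2.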
+ if (SplatValue.isNonNegative()) + return SRA; + + SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); + SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); + return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + } + return SDValue(); +} + +static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + // Optimize shl/srl/sra with constant shift amount. + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasInt256() && + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + if (Op.getOpcode() == ISD::SHL) + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + + if (VT == MVT::v16i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + llvm_unreachable("Unknown shift opcode."); + } + + if (Subtarget->hasInt256() && VT == MVT::v32i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. 
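+ // Same trick as the v16i8 case above: shift as v16i16, then mask away the + // bits that crossed over from the neighboring byte.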
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + llvm_unreachable("Unknown shift opcode."); + } + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getValueType().getVectorNumElements() / + VT.getVectorNumElements(); + unsigned RatioInLog2 = Log2_32_Ceil(Ratio); + uint64_t ShiftAmt = 0; + for (unsigned i = 0; i != Ratio; ++i) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); + if (C == 0) + return SDValue(); + // 6 == Log2(64) + ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); + } + // Check remaining shift amounts. + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = + dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (C == 0) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) + return SDValue(); + } + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + case ISD::SRL: + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + case ISD::SRA: + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + return SDValue(); +} + +static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || + VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasInt256() && + ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || + VT == MVT::v8i32 || VT == MVT::v16i16))) { + SDValue BaseShAmt; + EVT EltVT = VT.getVectorElementType(); + + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned i, j; + for (i = 0; i != NumElts; ++i) { + if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) + continue; + break; + } + for (j = i; j != NumElts; ++j) { + SDValue Arg = Amt.getOperand(j); + if (Arg.getOpcode() == ISD::UNDEF) continue; + if (Arg != Amt.getOperand(i)) + break; + } + if (i != NumElts && j == NumElts) + BaseShAmt = Amt.getOperand(i); + } else { + if (Amt.getOpcode() 
== ISD::EXTRACT_SUBVECTOR) + Amt = Amt.getOperand(0); + if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && + cast<ShuffleVectorSDNode>(Amt)->isSplat()) { + SDValue InVec = Amt.getOperand(0); + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + unsigned i = 0; + for (; i != NumElts; ++i) { + SDValue Arg = InVec.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + BaseShAmt = Arg; + break; + } + } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { + unsigned SplatIdx = + cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); + if (C->getZExtValue() == SplatIdx) + BaseShAmt = InVec.getOperand(1); + } + } + if (BaseShAmt.getNode() == 0) + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, + DAG.getIntPtrConstant(0)); + } + } + + if (BaseShAmt.getNode()) { + if (EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); + + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); + } + case ISD::SRA: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v4i32: + case MVT::v8i16: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); + } + case ISD::SRL: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); + } + } + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getValueType().getVectorNumElements() / + VT.getVectorNumElements(); + std::vector<SDValue> Vals(Ratio); + for (unsigned i = 0; i != Ratio; ++i) + Vals[i] = Amt.getOperand(i); + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + for (unsigned j = 0; j != Ratio; ++j) + if (Vals[j] != Amt.getOperand(i + j)) + return SDValue(); + } + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); + case ISD::SRL: + return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); + case ISD::SRA: + return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); + } + } + + return SDValue(); +} + +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + SDValue V; + + if (!Subtarget->hasSSE2()) + return SDValue(); + + V = LowerScalarImmediateShift(Op, DAG, Subtarget); + if (V.getNode()) + return V; + + V = LowerScalarVariableShift(Op, DAG, Subtarget); + if (V.getNode()) + return V; + + // AVX2 has VPSLLV/VPSRAV/VPSRLV. 
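+ // (AVX2 has no variable arithmetic right shift for 64-bit elements, which + // is why there is no v2i64/v4i64 case for ISD::SRA below.)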
+ if (Subtarget->hasInt256()) { + if (Op.getOpcode() == ISD::SRL && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v4i64 || VT == MVT::v8i32)) + return Op; + if (Op.getOpcode() == ISD::SHL && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v4i64 || VT == MVT::v8i32)) + return Op; + if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) + return Op; + } + + // Lower SHL with variable shift amount. + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { + assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); + + // a = a << 5; + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); + Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); + + // Turn 'a' into a mask suitable for VSELECT + SDValue VSelM = DAG.getConstant(0x80, VT); + SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + SDValue CM1 = DAG.getConstant(0x0f, VT); + SDValue CM2 = DAG.getConstant(0x3f, VT); + + // r = VSELECT(r, psllw(r & (char16)15, 4), a); + SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(4, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // r = VSELECT(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, CM2); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(2, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // return VSELECT(r, r+r, a); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, + DAG.getNode(ISD::ADD, dl, VT, R, R), R); + return R; + } + + // Decompose 256-bit shifts into smaller 128-bit shifts. 
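+ // Without the AVX2 forms above, split R and Amt into 128-bit halves, shift + // each half, and concatenate the results back together.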
+ if (VT.is256BitVector()) { + unsigned NumElems = VT.getVectorNumElements(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + // Extract the two vectors + SDValue V1 = Extract128BitVector(R, 0, DAG, dl); + SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); + + // Recreate the shift amount vectors + SDValue Amt1, Amt2; + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + // Constant shift amount + SmallVector<SDValue, 4> Amt1Csts; + SmallVector<SDValue, 4> Amt2Csts; + for (unsigned i = 0; i != NumElems/2; ++i) + Amt1Csts.push_back(Amt->getOperand(i)); + for (unsigned i = NumElems/2; i != NumElems; ++i) + Amt2Csts.push_back(Amt->getOperand(i)); + + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt1Csts[0], NumElems/2); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt2Csts[0], NumElems/2); + } else { + // Variable shift amount + Amt1 = Extract128BitVector(Amt, 0, DAG, dl); + Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); + } + + // Issue new vector shifts for the smaller types + V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); + V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); + + // Concatenate the result back + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); + } + + return SDValue(); +} + +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Lower the "add/sub/mul with overflow" instruction into a regular instruction + // plus a "setcc" instruction that checks the overflow flag. The "brcond" + // lowering looks for this combo and may remove the "setcc" instruction if the + // "setcc" has only one use. + SDNode *N = Op.getNode(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + unsigned BaseOp = 0; + unsigned Cond = 0; + DebugLoc DL = Op.getDebugLoc(); + switch (Op.getOpcode()) { + default: llvm_unreachable("Unknown ovf instruction!"); + case ISD::SADDO: + // An add of one will be selected as an INC. Note that INC doesn't + // set CF, so we can't do this for UADDO. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::ADD; + Cond = X86::COND_O; + break; + case ISD::UADDO: + BaseOp = X86ISD::ADD; + Cond = X86::COND_B; + break; + case ISD::SSUBO: + // A subtract of one will be selected as a DEC. Note that DEC doesn't + // set CF, so we can't do this for USUBO. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::SUB; + Cond = X86::COND_O; + break; + case ISD::USUBO: + BaseOp = X86ISD::SUB; + Cond = X86::COND_B; + break; + case ISD::SMULO: + BaseOp = X86ISD::SMUL; + Cond = X86::COND_O; + break; + case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), + MVT::i32); + SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(X86::COND_O, MVT::i32), + SDValue(Sum.getNode(), 2)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); + } + } + + // Also sets EFLAGS.
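+ // The X86ISD arithmetic nodes return two results, { value, EFLAGS }; the + // setcc below reads the overflow/carry condition out of result #1.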
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); + SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), + DAG.getConstant(Cond, MVT::i32), + SDValue(Sum.getNode(), 1)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); +} + +SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + EVT VT = Op.getValueType(); + + if (!Subtarget->hasSSE2() || !VT.isVector()) + return SDValue(); + + unsigned BitsDiff = VT.getScalarType().getSizeInBits() - + ExtraVT.getScalarType().getSizeInBits(); + SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i32: + case MVT::v16i16: + if (!Subtarget->hasFp256()) + return SDValue(); + if (!Subtarget->hasInt256()) { + // needs to be split + unsigned NumElems = VT.getVectorNumElements(); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); + ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, + ExtraNumElems/2); + SDValue Extra = DAG.getValueType(ExtraVT); + + LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); + LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); + } + // fall through + case MVT::v4i32: + case MVT::v8i16: { + // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. + if (Op0.getOpcode() == ISD::BITCAST && + Op00.getOpcode() == ISD::VECTOR_SHUFFLE) + Tmp1 = LowerVectorIntExtend(Op00, DAG); + if (Tmp1.getNode()) { + SDValue Tmp1Op0 = Tmp1.getOperand(0); + assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && + "This optimization is invalid without a VZEXT."); + return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. + Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG); + return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); + } + } +} + +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); + SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + + // The only fence that needs an instruction is a sequentially-consistent + // cross-thread fence. + if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. 
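+ // Without SSE2 on a 32-bit target, fall back to a LOCK-prefixed OR of zero + // into the top of the stack; any locked instruction is a full barrier.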
+ if (Subtarget->hasSSE2() || Subtarget->is64Bit()) + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); + + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. + Zero, + Chain + }; + SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); + return SDValue(Res, 0); + } + + // MEMBARRIER is a compiler barrier; it codegens to a no-op. + return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); +} + +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT T = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + unsigned Reg = 0; + unsigned size = 0; + switch(T.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid value type!"); + case MVT::i8: Reg = X86::AL; size = 1; break; + case MVT::i16: Reg = X86::AX; size = 2; break; + case MVT::i32: Reg = X86::EAX; size = 4; break; + case MVT::i64: + assert(Subtarget->is64Bit() && "Node not type legal!"); + Reg = X86::RAX; size = 8; + break; + } + SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, + Op.getOperand(2), SDValue()); + SDValue Ops[] = { cpIn.getValue(0), + Op.getOperand(1), + Op.getOperand(3), + DAG.getTargetConstant(size, MVT::i8), + cpIn.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, + Ops, array_lengthof(Ops), T, MMO); + SDValue cpOut = + DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); + return cpOut; +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->is64Bit() && "Result not type legalized?"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TheChain = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); + SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); + SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, + rax.getValue(2)); + SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, + DAG.getConstant(32, MVT::i8)); + SDValue Ops[] = { + DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), + rdx.getValue(1) + }; + return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); +} + +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { + EVT SrcVT = Op.getOperand(0).getValueType(); + EVT DstVT = Op.getValueType(); + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && + Subtarget->hasMMX() && "Unexpected custom BITCAST"); + assert((DstVT == MVT::i64 || + (DstVT.isVector() && DstVT.getSizeInBits()==64)) && + "Unexpected custom BITCAST"); + // i64 <=> MMX conversions are Legal. + if (SrcVT==MVT::i64 && DstVT.isVector()) + return Op; + if (DstVT==MVT::i64 && SrcVT.isVector()) + return Op; + // MMX <=> MMX conversions are Legal. + if (SrcVT.isVector() && DstVT.isVector()) + return Op; + // All other conversions need to be expanded. 
+ return SDValue(); +} + +static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + EVT T = Node->getValueType(0); + SDValue negOp = DAG.getNode(ISD::SUB, dl, T, + DAG.getConstant(0, T), Node->getOperand(2)); + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), negOp, + cast<AtomicSDNode>(Node)->getSrcValue(), + cast<AtomicSDNode>(Node)->getAlignment(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); +} + +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + + // Convert seq_cst store -> xchg + // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) + // FIXME: On 32-bit, store -> fist or movq would be more efficient + // (The only way to get a 16-byte store is cmpxchg16b) + // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. + if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); + return Swap.getValue(1); + } + // Other atomic stores have a simple pattern. + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getNode()->getValueType(0); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: llvm_unreachable("Invalid code"); + case ISD::ADDC: Opc = X86ISD::ADD; break; + case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; + case ISD::SUBC: Opc = X86ISD::SUB; break; + case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + +SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + + // For MacOSX, we want to call an alternative entry point: __sincos_stret, + // which returns the values as { float, float } (in XMM0) or + // { double, double } (which is returned in XMM0, XMM1). + DebugLoc dl = Op.getDebugLoc(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + bool isF64 = ArgVT == MVT::f64; + // Only optimize x86_64 for now. i386 is a bit messy. For f32, + // the small struct {f32, f32} is returned in (eax, edx). For f64, + // the results are returned via SRet in memory. + const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + Type *RetTy = isF64 + ? 
(Type*)StructType::get(ArgTy, ArgTy, NULL)
+    : (Type*)VectorType::get(ArgTy, 4);
+  TargetLowering::
+    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
+                         false, false, false, false, 0,
+                         CallingConv::C, /*isTailCall=*/false,
+                         /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+                         Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  if (isF64)
+    // Returned in xmm0 and xmm1.
+    return CallResult.first;
+
+  // Returned in bits 0:31 and 32:63 of xmm0.
+  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+                               CallResult.first, DAG.getIntPtrConstant(0));
+  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+                               CallResult.first, DAG.getIntPtrConstant(1));
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::SHL_PARTS:
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
+  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
+  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::FABS:               return LowerFABS(Op, DAG);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG);
+  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case
ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::FRAME_TO_ARGS_OFFSET: + return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); + case ISD::CTTZ: return LowerCTTZ(Op, DAG); + case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: return LowerShift(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: return LowerXALUO(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); + case ISD::BITCAST: return LowerBITCAST(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ADD: return LowerADD(Op, DAG); + case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + } +} + +static void ReplaceATOMIC_LOAD(SDNode *Node, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { + DebugLoc dl = Node->getDebugLoc(); + EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + + // Convert wide load -> cmpxchg8b/cmpxchg16b + // FIXME: On 32-bit, load -> fild or movq would be more efficient + // (The only way to get a 16-byte load is cmpxchg16b) + // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. + SDValue Zero = DAG.getConstant(0, VT); + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, + Node->getOperand(0), + Node->getOperand(1), Zero, Zero, + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); + Results.push_back(Swap.getValue(0)); + Results.push_back(Swap.getValue(1)); +} + +static void +ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG, unsigned NewOp) { + DebugLoc dl = Node->getDebugLoc(); + assert (Node->getValueType(0) == MVT::i64 && + "Only know how to expand i64 atomics"); + + SDValue Chain = Node->getOperand(0); + SDValue In1 = Node->getOperand(1); + SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(0)); + SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(1)); + SDValue Ops[] = { Chain, In1, In2L, In2H }; + SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); + SDValue Result = + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, + cast<MemSDNode>(Node)->getMemOperand()); + SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(Result.getValue(2)); +} + +/// ReplaceNodeResults - Replace a node with an illegal result type +/// with a new node built out of custom code. 
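+///
+/// For example, on a 32-bit target an i64 ATOMIC_LOAD_ADD has an illegal
+/// result type; ReplaceATOMIC_BINARY_64 above rebuilds it as an
+/// ATOMADD64_DAG node operating on two legal i32 halves.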
+void X86TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + DebugLoc dl = N->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + switch (N->getOpcode()) { + default: + llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::SIGN_EXTEND_INREG: + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + // We don't want to expand or promote these. + return; + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) + return; + + std::pair<SDValue,SDValue> Vals = + FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); + SDValue FIST = Vals.first, StackSlot = Vals.second; + if (FIST.getNode() != 0) { + EVT VT = N->getValueType(0); + // Return a load from the stack slot. + if (StackSlot.getNode() != 0) + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0)); + else + Results.push_back(FIST); + } + return; + } + case ISD::UINT_TO_FP: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (N->getOperand(0).getValueType() != MVT::v2i32 || + N->getValueType(0) != MVT::v2f32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + MVT::f64); + SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); + Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + return; + } + case ISD::FP_ROUND: { + if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) + return; + SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + Results.push_back(V); + return; + } + case ISD::READCYCLECOUNTER: { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TheChain = N->getOperand(0); + SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); + SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, + rd.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, + eax.getValue(2)); + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { eax, edx }; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, + array_lengthof(Ops))); + Results.push_back(edx.getValue(1)); + return; + } + case ISD::ATOMIC_CMP_SWAP: { + EVT T = N->getValueType(0); + assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); + bool Regs64bit = T == MVT::i128; + EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; + SDValue cpInL, cpInH; + cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(0, HalfT)); + cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(1, HalfT)); + cpInL = DAG.getCopyToReg(N->getOperand(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + cpInL, SDValue()); + cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, + Regs64bit ? 
X86::RDX : X86::EDX, + cpInH, cpInL.getValue(1)); + SDValue swapInL, swapInH; + swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(0, HalfT)); + swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(1, HalfT)); + swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + swapInL, cpInH.getValue(1)); + swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, + Regs64bit ? X86::RCX : X86::ECX, + swapInH, swapInL.getValue(1)); + SDValue Ops[] = { swapInH.getValue(0), + N->getOperand(1), + swapInH.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : + X86ISD::LCMPXCHG8_DAG; + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, + Ops, array_lengthof(Ops), T, MMO); + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + HalfT, Result.getValue(1)); + SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, + Regs64bit ? X86::RDX : X86::EDX, + HalfT, cpOutL.getValue(2)); + SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); + Results.push_back(cpOutH.getValue(1)); + return; + } + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_SWAP: { + unsigned Opc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::ATOMIC_LOAD_ADD: + Opc = X86ISD::ATOMADD64_DAG; + break; + case ISD::ATOMIC_LOAD_AND: + Opc = X86ISD::ATOMAND64_DAG; + break; + case ISD::ATOMIC_LOAD_NAND: + Opc = X86ISD::ATOMNAND64_DAG; + break; + case ISD::ATOMIC_LOAD_OR: + Opc = X86ISD::ATOMOR64_DAG; + break; + case ISD::ATOMIC_LOAD_SUB: + Opc = X86ISD::ATOMSUB64_DAG; + break; + case ISD::ATOMIC_LOAD_XOR: + Opc = X86ISD::ATOMXOR64_DAG; + break; + case ISD::ATOMIC_LOAD_MAX: + Opc = X86ISD::ATOMMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_MIN: + Opc = X86ISD::ATOMMIN64_DAG; + break; + case ISD::ATOMIC_LOAD_UMAX: + Opc = X86ISD::ATOMUMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_UMIN: + Opc = X86ISD::ATOMUMIN64_DAG; + break; + case ISD::ATOMIC_SWAP: + Opc = X86ISD::ATOMSWAP64_DAG; + break; + } + ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); + return; + } + case ISD::ATOMIC_LOAD: + ReplaceATOMIC_LOAD(N, Results, DAG); + } +} + +const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return NULL; + case X86ISD::BSF: return "X86ISD::BSF"; + case X86ISD::BSR: return "X86ISD::BSR"; + case X86ISD::SHLD: return "X86ISD::SHLD"; + case X86ISD::SHRD: return "X86ISD::SHRD"; + case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FOR: return "X86ISD::FOR"; + case X86ISD::FXOR: return "X86ISD::FXOR"; + case X86ISD::FSRL: return "X86ISD::FSRL"; + case X86ISD::FILD: return "X86ISD::FILD"; + case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; + case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; + case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; + case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FLD: return "X86ISD::FLD"; + case X86ISD::FST: return "X86ISD::FST"; + case X86ISD::CALL: 
return "X86ISD::CALL"; + case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::BT: return "X86ISD::BT"; + case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::COMI: return "X86ISD::COMI"; + case X86ISD::UCOMI: return "X86ISD::UCOMI"; + case X86ISD::SETCC: return "X86ISD::SETCC"; + case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; + case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; + case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; + case X86ISD::CMOV: return "X86ISD::CMOV"; + case X86ISD::BRCOND: return "X86ISD::BRCOND"; + case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; + case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; + case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; + case X86ISD::Wrapper: return "X86ISD::Wrapper"; + case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; + case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; + case X86ISD::PINSRB: return "X86ISD::PINSRB"; + case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; + case X86ISD::ANDNP: return "X86ISD::ANDNP"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SUBUS: return "X86ISD::SUBUS"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; + case X86ISD::UMAX: return "X86ISD::UMAX"; + case X86ISD::UMIN: return "X86ISD::UMIN"; + case X86ISD::SMAX: return "X86ISD::SMAX"; + case X86ISD::SMIN: return "X86ISD::SMIN"; + case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMAXC: return "X86ISD::FMAXC"; + case X86ISD::FMINC: return "X86ISD::FMINC"; + case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; + case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; + case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; + case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; + case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; + case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; + case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; + case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; + case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; + case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; + case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; + case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; + case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; + case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; + case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; + case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; + case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; + case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VZEXT: return "X86ISD::VZEXT"; + case X86ISD::VSEXT: return "X86ISD::VSEXT"; + case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; + case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; + case X86ISD::VSHL: return "X86ISD::VSHL"; + case X86ISD::VSRL: return 
"X86ISD::VSRL"; + case X86ISD::VSRA: return "X86ISD::VSRA"; + case X86ISD::VSHLI: return "X86ISD::VSHLI"; + case X86ISD::VSRLI: return "X86ISD::VSRLI"; + case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; + case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; + case X86ISD::ADD: return "X86ISD::ADD"; + case X86ISD::SUB: return "X86ISD::SUB"; + case X86ISD::ADC: return "X86ISD::ADC"; + case X86ISD::SBB: return "X86ISD::SBB"; + case X86ISD::SMUL: return "X86ISD::SMUL"; + case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::INC: return "X86ISD::INC"; + case X86ISD::DEC: return "X86ISD::DEC"; + case X86ISD::OR: return "X86ISD::OR"; + case X86ISD::XOR: return "X86ISD::XOR"; + case X86ISD::AND: return "X86ISD::AND"; + case X86ISD::BLSI: return "X86ISD::BLSI"; + case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; + case X86ISD::BLSR: return "X86ISD::BLSR"; + case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; + case X86ISD::PTEST: return "X86ISD::PTEST"; + case X86ISD::TESTP: return "X86ISD::TESTP"; + case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; + case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; + case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; + case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; + case X86ISD::SHUFP: return "X86ISD::SHUFP"; + case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; + case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; + case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; + case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; + case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; + case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; + case X86ISD::MOVSD: return "X86ISD::MOVSD"; + case X86ISD::MOVSS: return "X86ISD::MOVSS"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; + case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMI: return "X86ISD::VPERMI"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; + case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; + case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; + case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; + case X86ISD::SAHF: return "X86ISD::SAHF"; + case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::FMADD: return "X86ISD::FMADD"; + case X86ISD::FMSUB: return "X86ISD::FMSUB"; + case X86ISD::FNMADD: return "X86ISD::FNMADD"; + case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; + case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; + case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; + case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; + case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::XTEST: return "X86ISD::XTEST"; + } +} + +// isLegalAddressingMode - Return true if the addressing mode represented +// by AM is legal for this target, for a load/store of the specified type. +bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // X86 supports extremely general addressing modes. 
+ CodeModel::Model M = getTargetMachine().getCodeModel(); + Reloc::Model R = getTargetMachine().getRelocationModel(); + + // X86 allows a sign-extended 32-bit immediate field as a displacement. + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) + return false; + + if (AM.BaseGV) { + unsigned GVFlags = + Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); + + // If a reference to this global requires an extra load, we can't fold it. + if (isGlobalStubReference(GVFlags)) + return false; + + // If BaseGV requires a register for the PIC base, we cannot also have a + // BaseReg specified. + if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) + return false; + + // If lower 4G is not available, then we must use rip-relative addressing. + if ((M != CodeModel::Small || R != Reloc::Static) && + Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + return false; + } + + switch (AM.Scale) { + case 0: + case 1: + case 2: + case 4: + case 8: + // These scales always work. + break; + case 3: + case 5: + case 9: + // These scales are formed with basereg+scalereg. Only accept if there is + // no basereg yet. + if (AM.HasBaseReg) + return false; + break; + default: // Other stuff never works. + return false; + } + + return true; +} + +bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 > NumBits2; +} + +bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return isInt<32>(Imm); +} + +bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Can also use sub to handle negated immediates. + return isInt<32>(Imm); +} + +bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 > NumBits2; +} + +bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) + return true; + + if (Val.getOpcode() != ISD::LOAD) + return false; + + if (!VT1.isSimple() || !VT1.isInteger() || + !VT2.isSimple() || !VT2.isInteger()) + return false; + + switch (VT1.getSimpleVT().SimpleTy) { + default: break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + // X86 has 8, 16, and 32-bit zero-extending loads. + return true; + } + + return false; +} + +bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { + // i16 instructions are longer (0x66 prefix) and potentially slower. + return !(VT1 == MVT::i32 && VT2 == MVT::i16); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. 
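+///
+/// For example, the v4i32 splat mask <0,0,0,0> is accepted below (it is both
+/// a splat mask and a PSHUFD mask) and can be selected as a single
+/// pshufd $0x00.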
+bool +X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + // Very little shuffling can be done for 64-bit vectors right now. + if (VT.getSizeInBits() == 64) + return false; + + // FIXME: pshufb, blends, shifts. + return (VT.getVectorNumElements() == 2 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isMOVLMask(M, VT) || + isSHUFPMask(M, VT, Subtarget->hasFp256()) || + isPSHUFDMask(M, VT) || + isPSHUFHWMask(M, VT, Subtarget->hasInt256()) || + isPSHUFLWMask(M, VT, Subtarget->hasInt256()) || + isPALIGNRMask(M, VT, Subtarget) || + isUNPCKLMask(M, VT, Subtarget->hasInt256()) || + isUNPCKHMask(M, VT, Subtarget->hasInt256()) || + isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) || + isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256())); +} + +bool +X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const { + unsigned NumElts = VT.getVectorNumElements(); + // FIXME: This collection of masks seems suspect. + if (NumElts == 2) + return true; + if (NumElts == 4 && VT.is128BitVector()) { + return (isMOVLMask(Mask, VT) || + isCommutedMOVLMask(Mask, VT, true) || + isSHUFPMask(Mask, VT, Subtarget->hasFp256()) || + isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true)); + } + return false; +} + +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// + +/// Utility function to emit xbegin specifying the start of an RTM region. +static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // For the v = xbegin(), we generate + // + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax + + MachineBasicBlock *thisMBB = MBB; + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); + + // mainMBB: + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); + return sinkMBB; +} + +// Get CMPXCHG opcode for the specified data type. +static unsigned getCmpXChgOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::LCMPXCHG8; + case MVT::i16: return X86::LCMPXCHG16; + case MVT::i32: return X86::LCMPXCHG32; + case MVT::i64: return X86::LCMPXCHG64; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} + +// Get LOAD opcode for the specified data type. 
+static unsigned getLoadOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::MOV8rm; + case MVT::i16: return X86::MOV16rm; + case MVT::i32: return X86::MOV32rm; + case MVT::i64: return X86::MOV64rm; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction. +static unsigned getNonAtomicOpcode(unsigned Opc) { + switch (Opc) { + case X86::ATOMAND8: return X86::AND8rr; + case X86::ATOMAND16: return X86::AND16rr; + case X86::ATOMAND32: return X86::AND32rr; + case X86::ATOMAND64: return X86::AND64rr; + case X86::ATOMOR8: return X86::OR8rr; + case X86::ATOMOR16: return X86::OR16rr; + case X86::ATOMOR32: return X86::OR32rr; + case X86::ATOMOR64: return X86::OR64rr; + case X86::ATOMXOR8: return X86::XOR8rr; + case X86::ATOMXOR16: return X86::XOR16rr; + case X86::ATOMXOR32: return X86::XOR32rr; + case X86::ATOMXOR64: return X86::XOR64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction with +// extra opcode. +static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; + case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; + case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; + case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; + case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; + case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; + case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; + case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; + case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; + case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; + case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; + case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; + case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; + case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; + case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; + case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; + case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; + case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; + case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; + case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target. 
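+//
+// For example, ATOMADD6432 is performed as ADD32rr on the low half followed
+// by ADC32rr on the high half, so the carry out of the low word propagates.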
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { + switch (Opc) { + case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; + case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; + case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; + case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; + case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; + case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; + case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; + case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; + case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; + case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target with extra opcode. +static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, + unsigned &HiOpc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND6432: + ExtraOpc = X86::NOT32r; + HiOpc = X86::AND32rr; + return X86::AND32rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get pseudo CMOV opcode from the specified data type. +static unsigned getPseudoCMOVOpc(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::CMOV_GR8; + case MVT::i16: return X86::CMOV_GR16; + case MVT::i32: return X86::CMOV_GR32; + default: + break; + } + llvm_unreachable("Unknown CMOV opcode!"); +} + +// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. +// They will be translated into a spin-loop or compare-exchange loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// t1 = LOAD MI.addr +// loop: +// t4 = phi(t1, t3 / loop) +// t2 = OP MI.val, t4 +// EAX = t4 +// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] +// t3 = EAX +// JNE loop +// sink: +// dst = t3 +// ... 
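+//
+// As a concrete illustration (a hand-written sketch of the expected result,
+// not actual compiler output), an i32 atomic fetch-and-AND roughly becomes:
+//
+//       movl    (mem), %eax           # t1
+//   loop:                             # %eax carries t4
+//       movl    %eax, %ecx
+//       andl    val, %ecx             # t2 = OP MI.val, t4
+//       lock cmpxchgl %ecx, (mem)     # on failure, %eax reloads (mem)
+//       jne     loop                  # t3 = %eax
+//   sink:
+//       dst = %eax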
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+         "Unexpected number of operands");
+
+  assert(MI->hasOneMemOperand() &&
+         "Expected atomic-load-op to have one memoperand");
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg, SrcReg;
+  unsigned MemOpndSlot;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  MemOpndSlot = CurOp;
+  CurOp += X86::AddrNumOperands;
+  SrcReg = MI->getOperand(CurOp++).getReg();
+
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  MVT::SimpleValueType VT = *RC->vt_begin();
+  unsigned t1 = MRI.createVirtualRegister(RC);
+  unsigned t2 = MRI.createVirtualRegister(RC);
+  unsigned t3 = MRI.createVirtualRegister(RC);
+  unsigned t4 = MRI.createVirtualRegister(RC);
+  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+  unsigned LOADOpc = getLoadOpcode(VT);
+
+  // For the atomic load-arith operator, we generate
+  //
+  //  thisMBB:
+  //    t1 = LOAD [MI.addr]
+  //  mainMBB:
+  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
+  //    t2 = OP MI.val, t4
+  //    EAX = t4
+  //    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
+  //    t3 = EAX
+  //    JNE mainMBB
+  //  sinkMBB:
+  //    dst = t3
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+    unsigned flags = (*MMOI)->getFlags();
+    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+                               (*MMOI)->getSize(),
+                               (*MMOI)->getBaseAlignment(),
+                               (*MMOI)->getTBAAInfo(),
+                               (*MMOI)->getRanges());
+    MIB.addMemOperand(MMO);
+  }
+
+  thisMBB->addSuccessor(mainMBB);
+
+  // mainMBB:
+  MachineBasicBlock *origMainMBB = mainMBB;
+
+  // Add a PHI.
+ MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op opcode!"); + case X86::ATOMAND8: + case X86::ATOMAND16: + case X86::ATOMAND32: + case X86::ATOMAND64: + case X86::ATOMOR8: + case X86::ATOMOR16: + case X86::ATOMOR32: + case X86::ATOMOR64: + case X86::ATOMXOR8: + case X86::ATOMXOR16: + case X86::ATOMXOR32: + case X86::ATOMXOR64: { + unsigned ARITHOpc = getNonAtomicOpcode(Opc); + BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) + .addReg(t4); + break; + } + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: + case X86::ATOMNAND64: { + unsigned Tmp = MRI.createVirtualRegister(RC); + unsigned NOTOpc; + unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); + BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) + .addReg(t4); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); + break; + } + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: + case X86::ATOMMAX64: + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: + case X86::ATOMUMAX64: + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: { + unsigned CMPOpc; + unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); + + BuildMI(mainMBB, DL, TII->get(CMPOpc)) + .addReg(SrcReg) + .addReg(t4); + + if (Subtarget->hasCMov()) { + if (VT != MVT::i8) { + // Native support + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) + .addReg(SrcReg) + .addReg(t4); + } else { + // Promote i8 to i32 to use CMOV32 + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterClass *RC32 = + TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); + unsigned SrcReg32 = MRI.createVirtualRegister(RC32); + unsigned AccReg32 = MRI.createVirtualRegister(RC32); + unsigned Tmp = MRI.createVirtualRegister(RC32); + + unsigned Undef = MRI.createVirtualRegister(RC32); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) + .addReg(Undef) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) + .addReg(Undef) + .addReg(t4) + .addImm(X86::sub_8bit); + + BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) + .addReg(SrcReg32) + .addReg(AccReg32); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2) + .addReg(Tmp, 0, X86::sub_8bit); + } + } else { + // Use pseudo select and lower them. + assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && + "Invalid atomic-load-op transformation!"); + unsigned SelOpc = getPseudoCMOVOpc(VT); + X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); + assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); + MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) + .addReg(SrcReg).addReg(t4) + .addImm(CC); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + // Replace the original PHI node as mainMBB is changed after CMOV + // lowering. + BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); + Phi->eraseFromParent(); + } + break; + } + } + + // Copy PhyReg back from virtual register. 
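+  // (LCMPXCHG compares the accumulator, i.e. PhyReg = EAX or its
+  // width-appropriate sub/super-register, against memory, so t4 must be
+  // staged there first.)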
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) + .addReg(t4); + + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + MIB.addReg(t2); + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Copy PhyReg back to virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) + .addReg(PhyReg); + + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); + + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstReg) + .addReg(t3); + + MI->eraseFromParent(); + return sinkMBB; +} + +// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic +// instructions. They will be translated into a spin-loop or compare-exchange +// loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// t1L = LOAD [MI.addr + 0] +// t1H = LOAD [MI.addr + 4] +// loop: +// t4L = phi(t1L, t3L / loop) +// t4H = phi(t1H, t3H / loop) +// t2L = OP MI.val.lo, t4L +// t2H = OP MI.val.hi, t4H +// EAX = t4L +// EDX = t4H +// EBX = t2L +// ECX = t2H +// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] +// t3L = EAX +// t3H = EDX +// JNE loop +// sink: +// dstL = t3L +// dstH = t3H +// ... +MachineBasicBlock * +X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, + MachineBasicBlock *MBB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && + "Unexpected number of operands"); + + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op32 to have one memoperand"); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstLoReg, DstHiReg; + unsigned SrcLoReg, SrcHiReg; + unsigned MemOpndSlot; + + unsigned CurOp = 0; + + DstLoReg = MI->getOperand(CurOp++).getReg(); + DstHiReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcLoReg = MI->getOperand(CurOp++).getReg(); + SrcHiReg = MI->getOperand(CurOp++).getReg(); + + const TargetRegisterClass *RC = &X86::GR32RegClass; + const TargetRegisterClass *RC8 = &X86::GR8RegClass; + + unsigned t1L = MRI.createVirtualRegister(RC); + unsigned t1H = MRI.createVirtualRegister(RC); + unsigned t2L = MRI.createVirtualRegister(RC); + unsigned t2H = MRI.createVirtualRegister(RC); + unsigned t3L = MRI.createVirtualRegister(RC); + unsigned t3H = MRI.createVirtualRegister(RC); + unsigned t4L = MRI.createVirtualRegister(RC); + unsigned t4H = MRI.createVirtualRegister(RC); + + unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; + unsigned LOADOpc = X86::MOV32rm; + + // For the atomic load-arith operator, we generate + // + // thisMBB: + // t1L = LOAD [MI.addr + 0] + // t1H = LOAD [MI.addr + 4] + // mainMBB: + // t4L = phi(t1L / thisMBB, t3L / mainMBB) + // t4H = phi(t1H / thisMBB, t3H / mainMBB) + // t2L = OP MI.val.lo, t4L + // t2H = OP MI.val.hi, t4H + // EBX = t2L + // ECX = t2H + // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and 
EDX:EAX is implicitly defined]
+  //  t3L = EAX
+  //  t3H = EDX
+  //  JNE loop
+  // sinkMBB:
+  //  dstL = t3L
+  //  dstH = t3H
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  // Lo
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+    unsigned flags = (*MMOI)->getFlags();
+    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+                               (*MMOI)->getSize(),
+                               (*MMOI)->getBaseAlignment(),
+                               (*MMOI)->getTBAAInfo(),
+                               (*MMOI)->getRanges());
+    MIB.addMemOperand(MMO);
+  }
+  MachineInstr *LowMI = MIB;
+
+  // Hi
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp) {
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+    } else {
+      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+      if (NewMO.isReg())
+        NewMO.setIsKill(false);
+      MIB.addOperand(NewMO);
+    }
+  }
+  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
+
+  thisMBB->addSuccessor(mainMBB);
+
+  // mainMBB:
+  MachineBasicBlock *origMainMBB = mainMBB;
+
+  // Add PHIs.
+ MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) + .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); + MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) + .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); + case X86::ATOMAND6432: + case X86::ATOMOR6432: + case X86::ATOMXOR6432: + case X86::ATOMADD6432: + case X86::ATOMSUB6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) + .addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) + .addReg(SrcHiReg); + break; + } + case X86::ATOMNAND6432: { + unsigned HiOpc, NOTOpc; + unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); + unsigned TmpL = MRI.createVirtualRegister(RC); + unsigned TmpH = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) + .addReg(t4L); + BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) + .addReg(t4H); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); + break; + } + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + unsigned cL = MRI.createVirtualRegister(RC8); + unsigned cH = MRI.createVirtualRegister(RC8); + unsigned cL32 = MRI.createVirtualRegister(RC); + unsigned cH32 = MRI.createVirtualRegister(RC); + unsigned cc = MRI.createVirtualRegister(RC); + // cl := cmp src_lo, lo + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcLoReg).addReg(t4L); + BuildMI(mainMBB, DL, TII->get(LoOpc), cL); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); + // ch := cmp src_hi, hi + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcHiReg).addReg(t4H); + BuildMI(mainMBB, DL, TII->get(HiOpc), cH); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); + // cc := if (src_hi == hi) ? cl : ch; + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) + .addReg(cH32).addReg(cL32); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) + .addReg(cH32).addReg(cL32) + .addImm(X86::COND_E); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) + .addReg(SrcLoReg).addReg(t4L); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) + .addReg(SrcHiReg).addReg(t4H); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) + .addReg(SrcLoReg).addReg(t4L) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the + // 2nd CMOV lowering. + mainMBB->addLiveIn(X86::EFLAGS); + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) + .addReg(SrcHiReg).addReg(t4H) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + // Replace the original PHI node as mainMBB is changed after CMOV + // lowering. 
+      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
+        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
+        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+      PhiL->eraseFromParent();
+      PhiH->eraseFromParent();
+    }
+    break;
+  }
+  case X86::ATOMSWAP6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
+    break;
+  }
+  }
+
+  // Copy t4H:t4L into EDX:EAX (the expected old value for LCMPXCHG8B).
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
+  // Copy t2H:t2L into ECX:EBX (the replacement value).
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
+
+  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Copy EDX:EAX back to t3H:t3L
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
+
+  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+
+  mainMBB->addSuccessor(origMainMBB);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstLoReg)
+    .addReg(t3L);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstHiReg)
+    .addReg(t3H);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// or XMM0_V32I8 in AVX, all of this code can be replaced with that
+// in the .td file.
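+//
+// The PCMPxSTRM instructions write their mask result implicitly to XMM0,
+// which is why the expansion below issues the real instruction and then
+// copies XMM0 into the destination virtual register.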
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; + case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; + case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; + case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; + case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; + case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; + case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; + case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; + } + + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::XMM0); + + MI->eraseFromParent(); + return BB; +} + +// FIXME: Custom handling because TableGen doesn't support multiple implicit +// defs in an instruction pattern +static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; + case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; + case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; + case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; + case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; + case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; + case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; + case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; + } + + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); // remove the results + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::ECX); + + MI->eraseFromParent(); + return BB; +} + +static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII, + const X86Subtarget* Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + + // Address into RAX/EAX, other two args into ECX, EDX. + unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + + unsigned ValOps = X86::AddrNumOperands; + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(ValOps).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) + .addReg(MI->getOperand(ValOps+1).getReg()); + + // The instruction doesn't actually take any operands though. 
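+  // (MONITOR implicitly reads RAX/EAX for the address and ECX/EDX for the
+  // extension and hint arguments, which the copies above just set up.)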
+ BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAARG64WithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit va_arg instruction on X86-64. + + // Operands to this pseudo-instruction: + // 0 ) Output : destination address (reg) + // 1-5) Input : va_list address (addr, i64mem) + // 6 ) ArgSize : Size (in bytes) of vararg type + // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset + // 8 ) Align : Alignment of type + // 9 ) EFLAGS (implicit-def) + + assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); + assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); + + unsigned DestReg = MI->getOperand(0).getReg(); + MachineOperand &Base = MI->getOperand(1); + MachineOperand &Scale = MI->getOperand(2); + MachineOperand &Index = MI->getOperand(3); + MachineOperand &Disp = MI->getOperand(4); + MachineOperand &Segment = MI->getOperand(5); + unsigned ArgSize = MI->getOperand(6).getImm(); + unsigned ArgMode = MI->getOperand(7).getImm(); + unsigned Align = MI->getOperand(8).getImm(); + + // Memory Reference + assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + // Machine Information + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); + const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); + DebugLoc DL = MI->getDebugLoc(); + + // struct va_list { + // i32 gp_offset + // i32 fp_offset + // i64 overflow_area (address) + // i64 reg_save_area (address) + // } + // sizeof(va_list) = 24 + // alignment(va_list) = 8 + + unsigned TotalNumIntRegs = 6; + unsigned TotalNumXMMRegs = 8; + bool UseGPOffset = (ArgMode == 1); + bool UseFPOffset = (ArgMode == 2); + unsigned MaxOffset = TotalNumIntRegs * 8 + + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); + + /* Align ArgSize to a multiple of 8 */ + unsigned ArgSizeA8 = (ArgSize + 7) & ~7; + bool NeedsAlign = (Align > 8); + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *overflowMBB; + MachineBasicBlock *offsetMBB; + MachineBasicBlock *endMBB; + + unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB + unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB + unsigned OffsetReg = 0; + + if (!UseGPOffset && !UseFPOffset) { + // If we only pull from the overflow region, we don't create a branch. + // We don't need to alter control flow. + OffsetDestReg = 0; // unused + OverflowDestReg = DestReg; + + offsetMBB = NULL; + overflowMBB = thisMBB; + endMBB = thisMBB; + } else { + // First emit code to check if gp_offset (or fp_offset) is below the bound. + // If so, pull the argument from reg_save_area. (branch to offsetMBB) + // If not, pull from overflow_area. (branch to overflowMBB) + // + // thisMBB + // | . + // | . + // offsetMBB overflowMBB + // | . + // | . 
+ // endMBB + + // Registers for the PHI in endMBB + OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); + OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *MF = MBB->getParent(); + overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); + offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); + endMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + // Insert the new basic blocks + MF->insert(MBBIter, offsetMBB); + MF->insert(MBBIter, overflowMBB); + MF->insert(MBBIter, endMBB); + + // Transfer the remainder of MBB and its successor edges to endMBB. + endMBB->splice(endMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(MI)), + thisMBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Make offsetMBB and overflowMBB successors of thisMBB + thisMBB->addSuccessor(offsetMBB); + thisMBB->addSuccessor(overflowMBB); + + // endMBB is a successor of both offsetMBB and overflowMBB + offsetMBB->addSuccessor(endMBB); + overflowMBB->addSuccessor(endMBB); + + // Load the offset value into a register + OffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Check if there is enough room left to pull this argument. + BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) + .addReg(OffsetReg) + .addImm(MaxOffset + 8 - ArgSizeA8); + + // Branch to "overflowMBB" if offset >= max + // Fall through to "offsetMBB" otherwise + BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) + .addMBB(overflowMBB); + } + + // In offsetMBB, emit code to use the reg_save_area. + if (offsetMBB) { + assert(OffsetReg != 0); + + // Read the reg_save_area address. + unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 16) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Zero-extend the offset + unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + .addReg(OffsetReg64) + .addReg(RegSaveReg); + + // Compute the offset for the next argument + unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) + .addReg(OffsetReg) + .addImm(UseFPOffset ? 16 : 8); + + // Store it back into the va_list. + BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); + + // Jump to endMBB + BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) + .addMBB(endMBB); + } + + // + // Emit code to use overflow area + // + + // Load the overflow_area address into a register. 
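+  // (overflow_area lives at offset 8 in the va_list, matching the struct
+  // layout sketched above.)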
+ unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // If we need to align it, do so. Otherwise, just copy the address + // to OverflowDestReg. + if (NeedsAlign) { + // Align the overflow address + assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); + unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + + // aligned_addr = (addr + (align-1)) & ~(align-1) + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) + .addReg(OverflowAddrReg) + .addImm(Align-1); + + BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) + .addReg(TmpReg) + .addImm(~(uint64_t)(Align-1)); + } else { + BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) + .addReg(OverflowAddrReg); + } + + // Compute the next overflow address after this argument. + // (the overflow address should be kept 8-byte aligned) + unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) + .addReg(OverflowDestReg) + .addImm(ArgSizeA8); + + // Store the new overflow address. + BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); + + // If we branched, emit the PHI to the front of endMBB. + if (offsetMBB) { + BuildMI(*endMBB, endMBB->begin(), DL, + TII->get(X86::PHI), DestReg) + .addReg(OffsetDestReg).addMBB(offsetMBB) + .addReg(OverflowDestReg).addMBB(overflowMBB); + } + + // Erase the pseudo instruction + MI->eraseFromParent(); + + return endMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit code to save XMM registers to the stack. The ABI says that the + // number of registers to save is given in %al, so it's theoretically + // possible to do an indirect jump trick to avoid saving all of them, + // however this code takes a simpler approach and just executes all + // of the stores if %al is non-zero. It's less code, and it's probably + // easier on the hardware branch predictor, and stores aren't all that + // expensive anyway. + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *F = MBB->getParent(); + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, XMMSaveMBB); + F->insert(MBBIter, EndMBB); + + // Transfer the remainder of MBB and its successor edges to EndMBB. + EndMBB->splice(EndMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // The original block will now fall through to the XMM save block. + MBB->addSuccessor(XMMSaveMBB); + // The XMMSaveMBB will fall through to the end block. + XMMSaveMBB->addSuccessor(EndMBB); + + // Now add the instructions. 
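+ // (Hedged sketch of the shape of the code built below, not the exact
+ // emitted sequence:
+ //   testb %al, %al # %al holds the XMM-register count
+ //   je .LEndMBB # no FP varargs: skip all the stores
+ // .LXMMSaveMBB:
+ //   movaps %xmm0, fpoff+0(save)
+ //   movaps %xmm1, fpoff+16(save)
+ //   ... # one 16-byte aligned store per argument register
+ // .LEndMBB:
+ // On Win64 the %al test is omitted entirely, as the code below shows.)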
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned CountReg = MI->getOperand(0).getReg(); + int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); + int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); + + if (!Subtarget->isTargetWin64()) { + // If %al is 0, branch around the XMM save block. + BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); + BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + } + + unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. + for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { + int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; + MachineMemOperand *MMO = + F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(MI->getOperand(i).getReg()) + .addMemOperand(MMO); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + + return EndMBB; +} + +// The EFLAGS operand of SelectItr might be missing a kill marker +// because there were multiple uses of EFLAGS, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of EFLAGS. + MachineBasicBlock::iterator miI(llvm::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return false; + if (mi.definesRegister(X86::EFLAGS)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return false; + } + } + + // We found a def, or hit the end of the basic block and EFLAGS wasn't live + // out. SelectMI should have a kill flag on EFLAGS. + SelectItr->addRegisterKilled(X86::EFLAGS, TRI); + return true; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... 
+ // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + if (!MI->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { + copy0MBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // Create the conditional branch instruction. + unsigned Opc = + X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + copy0MBB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, + bool Is64Bit) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + assert(getTargetMachine().Options.EnableSegmentedStacks); + + unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + + // BB: + // ... [Till the alloca] + // If stacklet is not large enough, jump to mallocMBB + // + // bumpMBB: + // Allocate by subtracting from RSP + // Jump to continueMBB + // + // mallocMBB: + // Allocate by call to runtime + // + // continueMBB: + // ... + // [rest of original BB] + // + + MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); + + unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), + tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), + sizeVReg = MI->getOperand(1).getReg(), + physSPReg = Is64Bit ? 
X86::RSP : X86::ESP; + + MachineFunction::iterator MBBIter = BB; + ++MBBIter; + + MF->insert(MBBIter, bumpMBB); + MF->insert(MBBIter, mallocMBB); + MF->insert(MBBIter, continueMBB); + + continueMBB->splice(continueMBB->begin(), BB, llvm::next + (MachineBasicBlock::iterator(MI)), BB->end()); + continueMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add code to the main basic block to check if the stack limit has been hit, + // and if so, jump to mallocMBB otherwise to bumpMBB. + BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + .addReg(tmpSPVReg).addReg(sizeVReg); + BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(SPLimitVReg); + BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); + + // bumpMBB simply decreases the stack pointer, since we know the current + // stacklet has enough space. + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) + .addReg(SPLimitVReg); + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) + .addReg(SPLimitVReg); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + + // Calls into a routine in libgcc to allocate more space from the heap. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) + .addReg(X86::RAX, RegState::ImplicitDefine); + } else { + BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) + .addImm(12); + BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); + } + + if (!Is64Bit) + BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) + .addImm(16); + + BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) + .addReg(Is64Bit ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + + // Set up the CFG correctly. + BB->addSuccessor(bumpMBB); + BB->addSuccessor(mallocMBB); + mallocMBB->addSuccessor(continueMBB); + bumpMBB->addSuccessor(continueMBB); + + // Take care of the PHI nodes. + BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(mallocPtrVReg).addMBB(mallocMBB) + .addReg(bumpSPPtrVReg).addMBB(bumpMBB); + + // Delete the original pseudo instruction. + MI->eraseFromParent(); + + // And we're done. + return continueMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + assert(!Subtarget->isTargetEnvMacho()); + + // The lowering is pretty easy: we're just emitting the call to _alloca. The + // non-trivial part is impdef of ESP. + + if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetCygMing()) { + // ___chkstk(Mingw64): + // Clobbers R10, R11, RAX and EFLAGS. + // Updates RSP. 
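+ // (Editorial note: because the Mingw64 helper moves RSP itself, no
+ // explicit SUB of RSP is emitted on this path; contrast the MSVCRT
+ // __chkstk branch below, which only probes and leaves the RSP update
+ // to the SUB64rr following the call.)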
+ BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+ .addExternalSymbol("___chkstk")
+ .addReg(X86::RAX, RegState::Implicit)
+ .addReg(X86::RSP, RegState::Implicit)
+ .addReg(X86::RAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::RSP, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ } else {
+ // __chkstk(MSVCRT): does not update stack pointer.
+ // Clobbers R10, R11 and EFLAGS.
+ // FIXME: RAX(allocated size) might be reused and not killed.
+ BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+ .addExternalSymbol("__chkstk")
+ .addReg(X86::RAX, RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ // RAX has the offset to be subtracted from RSP.
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(X86::RAX);
+ }
+ } else {
+ const char *StackProbeSymbol =
+ Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
+
+ BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol(StackProbeSymbol)
+ .addReg(X86::EAX, RegState::Implicit)
+ .addReg(X86::ESP, RegState::Implicit)
+ .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::ESP, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ // This is pretty easy. We're taking the value that we received from
+ // our load from the relocation, sticking it in either RDI (x86-64)
+ // or EAX and doing an indirect call. The return value will then
+ // be in the normal return register.
+ const X86InstrInfo *TII
+ = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *F = BB->getParent();
+
+ assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
+ assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+ // Get a register mask for the lowered call.
+ // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+ // proper register mask.
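+ // (Hedged sketch of the 64-bit sequence built below, with _g standing in
+ // for the TLS global; the 32-bit variants differ only in base register
+ // and PIC setup:
+ //   movq _g@TLVP(%rip), %rdi # load the TLV descriptor address
+ //   callq *(%rdi) # call the descriptor's getter
+ //   # the variable's address comes back in %rax )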
+ const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + if (Subtarget->is64Bit()) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV64rm), X86::RDI) + .addReg(X86::RIP) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + addDirectMem(MIB, X86::RDI); + MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); + } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(0) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); + } else { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(TII->getGlobalBaseReg(F)) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg; + unsigned MemOpndSlot = 0; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MemOpndSlot = CurOp; + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + // For v = setjmp(buf), we generate + // + // thisMBB: + // buf[LabelOffset] = restoreMBB + // SjLjSetup restoreMBB + // + // mainMBB: + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + // restoreMBB: + // v_restore = 1 + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + MF->push_back(restoreMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. 
+ sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = getTargetMachine().getRelocationModel(); + bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; + // Store IP + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) + .addMBB(restoreMBB); + MIB.addRegMask(RegInfo->getNoPreservedMask()); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(restoreMBB); + + // mainMBB: + // EAX = 0 + BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(restoreMBB); + + // restoreMBB: + BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); + restoreMBB->addSuccessor(sinkMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + unsigned SP = RegInfo->getStackRegister(); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + + unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; + unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; + + // Reload FP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload IP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), LabelOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload SP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), SPOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Jump + BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected instr type to insert"); + case X86::TAILJMPd64: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + llvm_unreachable("TAILJMP64 would not be touched here."); + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + return BB; + case X86::WIN_ALLOCA: + return EmitLoweredWinAlloca(MI, BB); + case X86::SEG_ALLOCA_32: + return EmitLoweredSegAlloca(MI, BB, false); + case X86::SEG_ALLOCA_64: + return EmitLoweredSegAlloca(MI, BB, true); + case X86::TLSCall_32: + case X86::TLSCall_64: + return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_GR8: + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_V4F32: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V8F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + return EmitLoweredSelect(MI, BB); + + case X86::FP32_TO_INT16_IN_MEM: + case X86::FP32_TO_INT32_IN_MEM: + case X86::FP32_TO_INT64_IN_MEM: + case X86::FP64_TO_INT16_IN_MEM: + case X86::FP64_TO_INT32_IN_MEM: + case X86::FP64_TO_INT64_IN_MEM: + case X86::FP80_TO_INT16_IN_MEM: + case X86::FP80_TO_INT32_IN_MEM: + case X86::FP80_TO_INT64_IN_MEM: { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // Change the floating point control register to use "round towards zero" + // mode when truncating to an integer value. + MachineFunction *F = BB->getParent(); + int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FNSTCW16m)), CWFrameIdx); + + // Load the old value of the high byte of the control word... + unsigned OldCW = + F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), + CWFrameIdx); + + // Set the high part to be round to zero... + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) + .addImm(0xC7F); + + // Reload the modified control word now... + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + // Restore the memory image of control word to original value + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) + .addReg(OldCW); + + // Get the X86 opcode to use. 
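+ // (Descriptive note, not in the original comments: the 0xC7F written
+ // above masks all x87 exceptions and forces the rounding-control field,
+ // bits 11:10 of the control word, to 11b = round toward zero; combined
+ // with the IST_Fp*m* opcode selected below, this implements C's
+ // truncating float-to-int conversion.)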
+ unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; + case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; + case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; + case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; + case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; + case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; + case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; + case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; + case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; + } + + X86AddressMode AM; + MachineOperand &Op = MI->getOperand(0); + if (Op.isReg()) { + AM.BaseType = X86AddressMode::RegBase; + AM.Base.Reg = Op.getReg(); + } else { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = Op.getIndex(); + } + Op = MI->getOperand(1); + if (Op.isImm()) + AM.Scale = Op.getImm(); + Op = MI->getOperand(2); + if (Op.isImm()) + AM.IndexReg = Op.getImm(); + Op = MI->getOperand(3); + if (Op.isGlobal()) { + AM.GV = Op.getGlobal(); + } else { + AM.Disp = Op.getImm(); + } + addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) + .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); + + // Reload the original control word now. + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + // String/text processing lowering. + case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: + case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: + case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: + case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); + + // String/text processing lowering. + case X86::PCMPISTRIREG: + case X86::VPCMPISTRIREG: + case X86::PCMPISTRIMEM: + case X86::VPCMPISTRIMEM: + case X86::PCMPESTRIREG: + case X86::VPCMPESTRIREG: + case X86::PCMPESTRIMEM: + case X86::VPCMPESTRIMEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); + + // Thread synchronization. + case X86::MONITOR: + return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); + + // xbegin + case X86::XBEGIN: + return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); + + // Atomic Lowering. 
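+ // (Hedged sketch, editorial: the ATOM* pseudos below are expanded by
+ // EmitAtomicLoadArith into a compare-exchange retry loop, roughly
+ //   movl (mem), %eax
+ // .Lretry:
+ //   movl %eax, %tmp ; <op> %src, %tmp
+ //   lock cmpxchgl %tmp, (mem)
+ //   jne .Lretry # %eax ends up holding the old value
+ // with the exact expansion varying by operation and width.)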
+ case X86::ATOMAND8: + case X86::ATOMAND16: + case X86::ATOMAND32: + case X86::ATOMAND64: + // Fall through + case X86::ATOMOR8: + case X86::ATOMOR16: + case X86::ATOMOR32: + case X86::ATOMOR64: + // Fall through + case X86::ATOMXOR16: + case X86::ATOMXOR8: + case X86::ATOMXOR32: + case X86::ATOMXOR64: + // Fall through + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: + case X86::ATOMNAND64: + // Fall through + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: + case X86::ATOMMAX64: + // Fall through + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + // Fall through + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: + case X86::ATOMUMAX64: + // Fall through + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: + return EmitAtomicLoadArith(MI, BB); + + // This group does 64-bit operations on a 32-bit host. + case X86::ATOMAND6432: + case X86::ATOMOR6432: + case X86::ATOMXOR6432: + case X86::ATOMNAND6432: + case X86::ATOMADD6432: + case X86::ATOMSUB6432: + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: + case X86::ATOMSWAP6432: + return EmitAtomicLoadArith6432(MI, BB); + + case X86::VASTART_SAVE_XMM_REGS: + return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + + case X86::VAARG_64: + return EmitVAARG64WithCustomInserter(MI, BB); + + case X86::EH_SjLj_SetJmp32: + case X86::EH_SjLj_SetJmp64: + return emitEHSjLjSetJmp(MI, BB); + + case X86::EH_SjLj_LongJmp32: + case X86::EH_SjLj_LongJmp64: + return emitEHSjLjLongJmp(MI, BB); + } +} + +//===----------------------------------------------------------------------===// +// X86 Optimization Hooks +//===----------------------------------------------------------------------===// + +void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); + unsigned Opc = Op.getOpcode(); + assert((Opc >= ISD::BUILTIN_OP_END || + Opc == ISD::INTRINSIC_WO_CHAIN || + Opc == ISD::INTRINSIC_W_CHAIN || + Opc == ISD::INTRINSIC_VOID) && + "Should use MaskedValueIsZero if you don't know whether Op" + " is a target node!"); + + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + switch (Opc) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + // These nodes' second result is a boolean. + if (Op.getResNo() == 0) + break; + // Fallthrough + case X86ISD::SETCC: + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned NumLoBits = 0; + switch (IntId) { + default: break; + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx2_pmovmskb: { + // High bits of movmskp{s|d}, pmovmskb are known zero. + switch (IntId) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; + case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; + case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; + case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; + case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; + case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; + } + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); + break; + } + } + break; + } + } +} + +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth) const { + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + if (Op.getOpcode() == X86ISD::SETCC_CARRY) + return Op.getValueType().getScalarType().getSizeInBits(); + + // Fallback case. + return 1; +} + +/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the +/// node is a GlobalAddress + offset. +bool X86TargetLowering::isGAPlusOffset(SDNode *N, + const GlobalValue* &GA, + int64_t &Offset) const { + if (N->getOpcode() == X86ISD::Wrapper) { + if (isa<GlobalAddressSDNode>(N->getOperand(0))) { + GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); + Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); + return true; + } + } + return TargetLowering::isGAPlusOffset(N, GA, Offset); +} + +/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the +/// same as extracting the high 128-bit part of 256-bit vector and then +/// inserting the result into the low part of a new 256-bit vector +static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) + if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || + SVOp->getMaskElt(j) >= 0) + return false; + + return true; +} + +/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the +/// same as extracting the low 128-bit part of 256-bit vector and then +/// inserting the result into the high part of a new 256-bit vector +static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) + if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || + SVOp->getMaskElt(j) >= 0) + return false; + + return true; +} + +/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget* Subtarget) { + DebugLoc dl = N->getDebugLoc(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + EVT VT = SVOp->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (V1.getOpcode() == ISD::CONCAT_VECTORS && + V2.getOpcode() == ISD::CONCAT_VECTORS) { + // + // 0,0,0,... 
+ // | + // V UNDEF BUILD_VECTOR UNDEF + // \ / \ / + // CONCAT_VECTOR CONCAT_VECTOR + // \ / + // \ / + // RESULT: V + zero extended + // + if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || + V2.getOperand(1).getOpcode() != ISD::UNDEF || + V1.getOperand(1).getOpcode() != ISD::UNDEF) + return SDValue(); + + if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) + return SDValue(); + + // To match the shuffle mask, the first half of the mask should + // be exactly the first vector, and all the rest a splat with the + // first element of the second one. + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || + !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) + return SDValue(); + + // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { + if (Ld->hasNUsesOfValue(1, 0)) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + array_lengthof(Ops), + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as Ld in + // terms of dependency. We create a TokenFactor for Ld and ResNode, + // and update uses of Ld's output chain to use the TokenFactor. + if (Ld->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), + SDValue(ResNode.getNode(), 1)); + } + + return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); + } + } + + // Emit a zeroed vector and insert the desired subvector on its + // first half. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); + return DCI.CombineTo(N, InsV); + } + + //===--------------------------------------------------------------------===// + // Combine some shuffles into subvector extracts and inserts: + // + + // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (isShuffleHigh128VectorInsertLow(SVOp)) { + SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); + return DCI.CombineTo(N, InsV); + } + + // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (isShuffleLow128VectorInsertHigh(SVOp)) { + SDValue V = Extract128BitVector(V1, 0, DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); + return DCI.CombineTo(N, InsV); + } + + return SDValue(); +} + +/// PerformShuffleCombine - Performs several different shuffle combines. +static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // Don't create instructions with illegal types after legalize types has run. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) + return SDValue(); + + // Combine 256-bit vector shuffles. 
This is only profitable when in AVX mode.
+ if (Subtarget->hasFp256() && VT.is256BitVector() &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE)
+ return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+
+ // Only handle 128-bit wide vectors from here on.
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+ // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+ // consecutive, non-overlapping, and in the right order.
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+ return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
+
+/// PerformTruncateCombine - Converts a truncate operation to
+/// a sequence of vector shuffle operations.
+/// This is possible when truncating a 256-bit vector to a 128-bit vector.
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been custom lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue InVec = N->getOperand(0);
+ SDValue EltNo = N->getOperand(1);
+
+ if (!isa<ConstantSDNode>(EltNo))
+ return SDValue();
+
+ EVT VT = InVec.getValueType();
+
+ bool HasShuffleIntoBitcast = false;
+ if (InVec.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+ EVT BCVT = InVec.getOperand(0).getValueType();
+ if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+ return SDValue();
+ InVec = InVec.getOperand(0);
+ HasShuffleIntoBitcast = true;
+ }
+
+ if (!isTargetShuffle(InVec.getOpcode()))
+ return SDValue();
+
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ SmallVector<int, 16> ShuffleMask;
+ bool UnaryShuffle;
+ if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+ UnaryShuffle))
+ return SDValue();
+
+ // Select the input vector, guarding against an out-of-range extract index.
+ unsigned NumElems = VT.getVectorNumElements();
+ int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+ SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
+ : InVec.getOperand(1);
+
+ // If inputs to the shuffle are the same for both ops, then allow 2 uses.
+ unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+ if (LdNode.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+ return SDValue();
+
+ AllowedUses = 1; // only allow 1 load use if we have a bitcast
+ LdNode = LdNode.getOperand(0);
+ }
+
+ if (!ISD::isNormalLoad(LdNode.getNode()))
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+ if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+ return SDValue();
+
+ if (HasShuffleIntoBitcast) {
+ // If there's a bitcast before the shuffle, check if the load type and
+ // alignment are valid.
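+ // (Illustrative example, editorial: folding the extract through the
+ // bitcast re-types the load, and the wider type may demand stricter ABI
+ // alignment; e.g. a 4-byte-aligned scalar load reinterpreted at a type
+ // whose ABI alignment is 16 would become an under-aligned access, which
+ // is why the code below bails out when NewAlign exceeds Align.)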
+ unsigned Align = LN0->getAlignment();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NewAlign = TLI.getDataLayout()->
+ getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+ return SDValue();
+ }
+
+ // All checks match, so transform back to vector_shuffle so that the DAG
+ // combiner can finish the job.
+ DebugLoc dl = N->getDebugLoc();
+
+ // Create the shuffle node, taking into account the case that it's a unary
+ // shuffle.
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+ Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+ InVec.getOperand(0), Shuffle,
+ &ShuffleMask[0]);
+ Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+ EltNo);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+ if (NewOp.getNode())
+ return NewOp;
+
+ SDValue InputVector = N->getOperand(0);
+ // Detect whether we are trying to convert from mmx to i32 and the bitcast
+ // from mmx to v2i32 has a single usage.
+ if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+ InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+ InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // Only operate on vectors of 4 elements, where the alternative shuffling
+ // gets to be more expensive.
+ if (InputVector.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+ // single use which is a sign-extend or zero-extend, and all elements are
+ // used.
+ SmallVector<SDNode *, 4> Uses;
+ unsigned ExtractedElements = 0;
+ for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+ UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != InputVector.getResNo())
+ return SDValue();
+
+ SDNode *Extract = *UI;
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (Extract->getValueType(0) != MVT::i32)
+ return SDValue();
+ if (!Extract->hasOneUse())
+ return SDValue();
+ if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+ Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+ return SDValue();
+
+ // Record which element was extracted.
+ ExtractedElements |=
+ 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+ Uses.push_back(Extract);
+ }
+
+ // If not all the elements were used, this may not be worthwhile.
+ if (ExtractedElements != 15)
+ return SDValue();
+
+ // Ok, we've now decided to do the transformation.
+ DebugLoc dl = InputVector.getDebugLoc();
+
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo(), false, false, 0);
+
+ // Replace each use (extract) with a load of the appropriate element.
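+ // (Hedged sketch of the transform: the v4i32 vector is spilled once to a
+ // stack temporary, and each extract of lane i becomes a scalar load from
+ // StackPtr + 4*i, which the loop below materializes.)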
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
+
+ // Compute the element's address.
+ SDValue Idx = Extract->getOperand(1);
+ unsigned EltSize =
+ InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+ uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+ StackPtr, OffsetVal);
+
+ // Load the scalar.
+ SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+ ScalarAddr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // Replace the extract with the load.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+ }
+
+ // The replacement was made in place; don't return anything.
+ return SDValue();
+}
+
+/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ if (!VT.isVector())
+ return 0;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ if (!Subtarget->hasAVX2())
+ return 0;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ if (!Subtarget->hasSSE2())
+ return 0;
+ }
+
+ // SSE2 has only a small subset of the operations.
+ bool hasUnsigned = Subtarget->hasSSE41() ||
+ (Subtarget->hasSSE2() && VT == MVT::v16i8);
+ bool hasSigned = Subtarget->hasSSE41() ||
+ (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return hasUnsigned ? X86ISD::UMIN : 0;
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ return hasUnsigned ? X86ISD::UMAX : 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ return hasSigned ? X86ISD::SMIN : 0;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ return hasSigned ? X86ISD::SMAX : 0;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return hasUnsigned ? X86ISD::UMAX : 0;
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ return hasUnsigned ? X86ISD::UMIN : 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ return hasSigned ? X86ISD::SMAX : 0;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ return hasSigned ? X86ISD::SMIN : 0;
+ }
+ }
+
+ return 0;
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
+
+ // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+ // instructions match the semantics of the common C idiom x<y?x:y but not
+ // x<=y?x:y, because of how they handle negative zero (which can be
+ // ignored in unsafe-math mode).
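+ // (Editorial example: SSE MINSS returns its second operand whenever the
+ // compare is false or unordered, so x<y?x:y maps onto it exactly, NaNs
+ // and signed zeros included. x<=y?x:y does not: with x = +0.0, y = -0.0
+ // the idiom yields +0.0 while the min would yield -0.0. Hence the
+ // condition-code-by-condition-code analysis below.)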
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && + VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + (Subtarget->hasSSE2() || + (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + unsigned Opcode = 0; + // Check for x CC y ? x : y. + if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + switch (CC) { + default: break; + case ISD::SETULT: + // Converting this to a min would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETOLE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETULE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOGE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGT: + // Converting this to a max would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMAX; + break; + } + // Check for x CC y ? y : x -- a min/max with reversed arms. + } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && + DAG.isEqualTo(RHS, Cond.getOperand(0))) { + switch (CC) { + default: break; + case ISD::SETOGE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGT: + // Converting this to a min would handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. 
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETULT:
+ // Converting this to a max would handle NaNs incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETULE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+ // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
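+ // (Worked example, editorial: select(C, 5, 2) has Diff = 3, so the code
+ // below emits zext(C) scaled by 3 and then adds 2, which matches the
+ // "lea base(cond, cond*2)" row of the table; the switch whitelists
+ // exactly the multipliers an LEA can encode.)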
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ // Canonicalize max and min:
+ // (x > y) ? x : y -> (x >= y) ? x : y
+ // (x < y) ? x : y -> (x <= y) ? x : y
+ // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+ // the need for an extra compare against zero. e.g.
+ // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
+ // subl %esi, %edi
+ // testl %edi, %edi
+ // movl $0, %eax
+ // cmovgl %edi, %eax
+ // =>
+ // xorl %eax, %eax
+ // subl %esi, %edi
+ // cmovsl %eax, %edi
+ if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ switch (CC) {
+ default: break;
+ case ISD::SETLT:
+ case ISD::SETGT: {
+ ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+ Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ }
+ }
+ }
+
+ // Match VSELECTs into subs with unsigned saturation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+ ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+ // left side, invert the predicate to simplify the logic below.
+ SDValue Other;
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ Other = RHS;
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ Other = LHS;
+ }
+
+ if (Other.getNode() && Other->getNumOperands() == 2 &&
+ DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+ SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+ SDValue CondRHS = Cond->getOperand(1);
+
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> subus x, y
+ // x > y ? x-y : 0 --> subus x, y
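+ // (Worked example, editorial: for unsigned i8 x, "x >= 20 ? x - 20 : 0"
+ // becomes a single psubusb against splat(20), since an unsigned
+ // saturating subtract already clamps the result at zero.)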
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      // If the RHS is a constant, we have to reverse the const
+      // canonicalization.
+      // x > C-1 ? x+(-C) : 0 --> subus x, C
+      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (CondRHS.getConstantOperandVal(0) == -A-1)
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+                             DAG.getConstant(-A, VT));
+      }
+
+      // Another special case: If C was a sign bit, the sub has been
+      // canonicalized into an xor.
+      // FIXME: Would it be better to use ComputeMaskedBits to determine
+      //        whether it's safe to decanonicalize the xor?
+      // x s< 0 ? x^C : 0 --> subus x, C
+      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+          isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (A.isSignBit())
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+      }
+    }
+  }
+
+  // Try to match a min/max vector operation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
+  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
+  if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
+      Cond.getOpcode() == ISD::SETCC) {
+
+    assert(Cond.getValueType().isVector() &&
+           "vector select expects a vector selector!");
+
+    EVT IntVT = Cond.getValueType();
+    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!TValIsAllOnes && !FValIsAllZeros) {
+      // Try inverting the condition if the true value is not all 1s and the
+      // false value is not all 0s.
+      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+      if (TValIsAllZeros || FValIsAllOnes) {
+        SDValue CC = Cond.getOperand(2);
+        ISD::CondCode NewCC =
+          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                               Cond.getOperand(0).getValueType().isInteger());
+        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1),
+                            NewCC);
+        std::swap(LHS, RHS);
+        TValIsAllOnes = FValIsAllOnes;
+        FValIsAllZeros = TValIsAllZeros;
+      }
+    }
+
+    if (TValIsAllOnes || FValIsAllZeros) {
+      SDValue Ret;
+
+      if (TValIsAllOnes && FValIsAllZeros)
+        Ret = Cond;
+      else if (TValIsAllOnes)
+        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+      else if (FValIsAllZeros)
+        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+
+      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+    }
+  }
+
+  // If we know that this node is legal then we know that it is going to be
+  // matched by one of the SSE/AVX BLEND instructions. These instructions only
+  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+  // to simplify previous instructions.
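+  // For example, PBLENDVB reads only bit 7 of each selector byte, so a mask
+  // byte of 0x80 blends exactly like 0xFF; whatever computes the low bits
+  // of the selector is dead and can be simplified away.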
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+    // Don't optimize vector selects that map to mask-registers.
+    if (BitWidth == 1)
+      return SDValue();
+
+    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                          DCI.isBeforeLegalizeOps());
+    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+
+  return SDValue();
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Quit if this is neither a CMP nor a SUB whose value result is unused.
+  if (Cmp.getOpcode() != X86ISD::CMP &&
+      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+    return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // Check CMP operands. One of them should be 0 or 1 and the other should be
+  // a SetCC or extended from it.
+  SDValue Op1 = Cmp.getOperand(0);
+  SDValue Op2 = Cmp.getOperand(1);
+
+  SDValue SetCC;
+  const ConstantSDNode* C = 0;
+  bool needOppositeCond = (CC == X86::COND_E);
+  bool checkAgainstTrue = false; // Is it a comparison against 1?
+
+  if ((C = dyn_cast<ConstantSDNode>(Op1)))
+    SetCC = Op2;
+  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+    SetCC = Op1;
+  else // Quit if neither operand is a constant.
+    return SDValue();
+
+  if (C->getZExtValue() == 1) {
+    needOppositeCond = !needOppositeCond;
+    checkAgainstTrue = true;
+  } else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 nor 1.
+    return SDValue();
+
+  bool truncatedToBoolWithAnd = false;
+  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
+  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+         SetCC.getOpcode() == ISD::TRUNCATE ||
+         SetCC.getOpcode() == ISD::AND) {
+    if (SetCC.getOpcode() == ISD::AND) {
+      int OpIdx = -1;
+      ConstantSDNode *CS;
+      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
+          CS->getZExtValue() == 1)
+        OpIdx = 1;
+      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
+          CS->getZExtValue() == 1)
+        OpIdx = 0;
+      if (OpIdx == -1)
+        break;
+      SetCC = SetCC.getOperand(OpIdx);
+      truncatedToBoolWithAnd = true;
+    } else
+      SetCC = SetCC.getOperand(0);
+  }
+
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC_CARRY:
+    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+    // truncated to i1 using 'and'.
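+    // Concretely: with CF set, SETCC_CARRY produces ~0 (i.e. -1), so
+    // (CMP -1, 1) EQ is false even though the boolean is logically true;
+    // only after (and x, 1) does the value collapse to 0 or 1.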
+ if (checkAgainstTrue && !truncatedToBoolWithAnd) + break; + assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && + "Invalid use of SETCC_CARRY!"); + // FALL THROUGH + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. + if (!FVal) { + SDValue Op = SetCC.getOperand(0); + // Skip 'zext' or 'trunc' node. + if (Op.getOpcode() == ISD::ZERO_EXTEND || + Op.getOpcode() == ISD::TRUNCATE) + Op = Op.getOperand(0); + // A special case for rdrand/rdseed, where 0 is set if false cond is + // found. + if ((Op.getOpcode() != X86ISD::RDRAND && + Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. + needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } + } + + return SDValue(); +} + +/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] +static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + DebugLoc DL = N->getDebugLoc(); + + // If the flag operand isn't dead, don't touch this CMOV. + if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) + return SDValue(); + + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + + SDValue Flags; + + Flags = checkBoolTestSetCCCombine(Cond, CC); + if (Flags.getNode() && + // Extra check as FCMOV only supports a subset of X86 cond. + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { + SDValue Ops[] = { FalseOp, TrueOp, + DAG.getConstant(CC, MVT::i8), Flags }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), + Ops, array_lengthof(Ops)); + } + + // If this is a select between two integer constants, try to do some + // optimizations. Note that the operands are ordered the opposite of SELECT + // operands. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { + // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is + // larger than FalseC (the false value). 
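+      // For instance, (cond ? 0 : 8) becomes (!cond ? 8 : 0), so the
+      // power-of-two and constant-difference cases below only need to
+      // handle TrueC > FalseC.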
+      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueC, FalseC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+      // This is efficient for any integer data type (including i8/i16) and
+      // shift amount.
+      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(ShAmt, MVT::i8));
+        if (N->getNumValues() == 2) // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is
+      // efficient for any integer data type, including i8/i16.
+      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                           SDValue(FalseC, 0));
+
+        if (N->getNumValues() == 2) // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize cases that will turn into an LEA instruction. This requires
+      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+        bool isFastMultiplier = false;
+        if (Diff < 10) {
+          switch ((unsigned char)Diff) {
+          default: break;
+          case 1:  // result = add base, cond
+          case 2:  // result = lea base(    , cond*2)
+          case 3:  // result = lea base(cond, cond*2)
+          case 4:  // result = lea base(    , cond*4)
+          case 5:  // result = lea base(cond, cond*4)
+          case 8:  // result = lea base(    , cond*8)
+          case 9:  // result = lea base(cond, cond*8)
+            isFastMultiplier = true;
+            break;
+          }
+        }
+
+        if (isFastMultiplier) {
+          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                             DAG.getConstant(CC, MVT::i8), Cond);
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                             Cond);
+          // Scale the condition by the difference.
+          if (Diff != 1)
+            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(Diff, Cond.getValueType()));
+
+          // Add the base if non-zero.
+          if (FalseC->getAPIntValue() != 0)
+            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                               SDValue(FalseC, 0));
+          if (N->getNumValues() == 2) // Dead flag value?
+            return DCI.CombineTo(N, Cond, SDValue());
+          return Cond;
+        }
+      }
+    }
+  }
+
+  // Handle these cases:
+  // (select (x != c), e, c) -> (select (x != c), e, x),
+  // (select (x == c), c, e) -> (select (x == c), x, e)
+  // where c is an integer constant, and the "select" is the combination
+  // of CMOV and CMP.
+  //
+  // The rationale for this change is that the conditional-move from a
+  // constant needs two instructions, whereas conditional-move from a
+  // register needs only one instruction.
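+  // For example, (select (x == 7), 7, e) needs
+  //   movl $7, %ecx; cmovel %ecx, %eax
+  // while the equivalent (select (x == 7), x, e) needs only
+  //   cmovel %edi, %eax
+  // (register assignments here are illustrative).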
+  //
+  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+  // some instruction-combining opportunities. This opt needs to be
+  // postponed as late as possible.
+  //
+  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+    // The DCI.xxxx conditions are provided to postpone the optimization as
+    // late as possible.
+
+    ConstantSDNode *CmpAgainst = 0;
+    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+        !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+      if (CC == X86::COND_NE &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      if (CC == X86::COND_E &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+                          DAG.getConstant(CC, MVT::i8), Cond };
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
+                           array_lengthof(Ops));
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// operations in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+  uint64_t MulAmt = C->getZExtValue();
+  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+    return SDValue();
+
+  uint64_t MulAmt1 = 0;
+  uint64_t MulAmt2 = 0;
+  if ((MulAmt % 9) == 0) {
+    MulAmt1 = 9;
+    MulAmt2 = MulAmt / 9;
+  } else if ((MulAmt % 5) == 0) {
+    MulAmt1 = 5;
+    MulAmt2 = MulAmt / 5;
+  } else if ((MulAmt % 3) == 0) {
+    MulAmt1 = 3;
+    MulAmt2 = MulAmt / 3;
+  }
+  if (MulAmt2 &&
+      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
+    DebugLoc DL = N->getDebugLoc();
+
+    if (isPowerOf2_64(MulAmt2) &&
+        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If the second multiplier is pow2, issue it first. We want the multiply
+      // by 3, 5, or 9 to be folded into the addressing mode unless the lone
+      // use is an add.
+      std::swap(MulAmt1, MulAmt2);
+
+    SDValue NewMul;
+    if (isPowerOf2_64(MulAmt1))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                           DAG.getConstant(MulAmt1, VT));
+
+    if (isPowerOf2_64(MulAmt2))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+                           DAG.getConstant(MulAmt2, VT));
+
+    // Do not add new nodes to the DAG combiner worklist.
+    DCI.CombineTo(N, NewMul, false);
+  }
+  return SDValue();
+}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+  // since the result of setcc_c is all zeros or all ones.
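+  // A quick check with c1 = 1 and c2 = 3:
+  //   shl(and(setcc_c, 1), 3) -> and(setcc_c, 8)
+  // Both sides are 0 when setcc_c is 0 and 8 when setcc_c is ~0, so
+  // shifting the mask instead of the value is safe.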
+ if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && + N0.getOperand(1).getOpcode() == ISD::Constant) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == X86ISD::SETCC_CARRY || + ((N00.getOpcode() == ISD::ANY_EXTEND || + N00.getOpcode() == ISD::ZERO_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + if (Mask != 0) + return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, + N00, DAG.getConstant(Mask, VT)); + } + } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. + // (shl V, 1) -> add V,V + if (isSplatVector(N1.getNode())) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1C && (1 == N1C->getZExtValue())) { + return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + } + } + + return SDValue(); +} + +/// PerformShiftCombine - Combine shifts. +static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (N->getOpcode() == ISD::SHL) { + SDValue V = PerformSHLCombine(N, DAG); + if (V.getNode()) return V; + } + + return SDValue(); +} + +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. +static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // The SETCCs should both refer to the same CMP. + if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + X86ISD::NodeType NTOperator = is64BitFP ? 
+            X86ISD::FSETCCsd : X86ISD::FSETCCss;
+          // FIXME: need symbolic constants for these magic numbers.
+          // See X86ATTInstPrinter.cpp:printSSECC().
+          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32,
+                                              CMP00, CMP01,
+                                              DAG.getConstant(x86cc, MVT::i8));
+          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+                                              OnesOrZeroesF);
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
+                                      DAG.getConstant(1, MVT::i32));
+          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                              ANDed);
+          return OneBitOfTruth;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match direct AllOnes for 128 and 256-bit vectors.
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from an insert_subvector building a
+  // 256-bit allones vector.
+  if (VT.is256BitVector() &&
+      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue V1 = N->getOperand(0);
+    SDValue V2 = N->getOperand(1);
+
+    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+        ISD::isBuildVectorAllOnes(V2.getNode()))
+      return true;
+  }
+
+  return false;
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
+// register. In most cases we actually compare or select YMM-sized registers,
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::ANY_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+  SDValue Narrow = N->getOperand(0);
+  EVT NarrowVT = Narrow->getValueType(0);
+  if (!NarrowVT.is128BitVector())
+    return SDValue();
+
+  if (Narrow->getOpcode() != ISD::XOR &&
+      Narrow->getOpcode() != ISD::AND &&
+      Narrow->getOpcode() != ISD::OR)
+    return SDValue();
+
+  SDValue N0 = Narrow->getOperand(0);
+  SDValue N1 = Narrow->getOperand(1);
+  DebugLoc DL = Narrow->getDebugLoc();
+
+  // The left side has to be a trunc.
+  if (N0.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  // The type of the truncated inputs.
+  EVT WideVT = N0->getOperand(0)->getValueType(0);
+  if (WideVT != VT)
+    return SDValue();
+
+  // The right side has to be a 'trunc' or a constant vector.
+  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+  bool RHSConst = (isSplatVector(N1.getNode()) &&
+                   isa<ConstantSDNode>(N1->getOperand(0)));
+  if (!RHSTrunc && !RHSConst)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+    return SDValue();
+
+  // Set N0 and N1 to hold the inputs to the new wide operation.
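+  // For example, (zext (and (trunc x), (trunc y)) to v8i32), where x and y
+  // are v8i32 and the AND is v8i16, becomes (and (and x, y), splat(0xFFFF)):
+  // the logic op runs at YMM width and a final mask models the truncation.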
+ N0 = N0->getOperand(0); + if (RHSConst) { + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), + N1->getOperand(0)); + SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); + } else if (RHSTrunc) { + N1 = N1->getOperand(0); + } + + // Generate the wide operation. + SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + case ISD::ANY_EXTEND: + return Op; + case ISD::ZERO_EXTEND: { + unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); + APInt Mask = APInt::getAllOnesValue(InBits); + Mask = Mask.zext(VT.getScalarType().getSizeInBits()); + return DAG.getNode(ISD::AND, DL, VT, + Op, DAG.getConstant(Mask, VT)); + } + case ISD::SIGN_EXTEND: + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, + Op, DAG.getValueType(NarrowVT)); + default: + llvm_unreachable("Unexpected opcode"); + } +} + +static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + + // Create BLSI, and BLSR instructions + // BLSI is X & (-X) + // BLSR is X & (X-1) + if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check LHS for neg + if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && + isZero(N0.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N1); + + // Check RHS for neg + if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && + isZero(N1.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + + // Check LHS for X-1 + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + + // Check RHS for X-1 + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + + return SDValue(); + } + + // Want to form ANDNP nodes: + // 1) In the hopes of then easily combining them with OR and AND nodes + // to form PBLEND/PSIGN. 
+  // 2) To match ANDN packed intrinsics
+  if (VT != MVT::v2i64 && VT != MVT::v4i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check LHS for vnot
+  if (N0.getOpcode() == ISD::XOR &&
+      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+
+  // Check RHS for vnot
+  if (N1.getOpcode() == ISD::XOR &&
+      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // look for psign/blend
+  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
+    if (!Subtarget->hasSSSE3() ||
+        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
+      return SDValue();
+
+    // Canonicalize pandn to RHS
+    if (N0.getOpcode() == X86ISD::ANDNP)
+      std::swap(N0, N1);
+    // or (and (m, y), (pandn m, x))
+    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
+      SDValue Mask = N1.getOperand(0);
+      SDValue X = N1.getOperand(1);
+      SDValue Y;
+      if (N0.getOperand(0) == Mask)
+        Y = N0.getOperand(1);
+      if (N0.getOperand(1) == Mask)
+        Y = N0.getOperand(0);
+
+      // Check to see if the mask appeared in both the AND and ANDNP.
+      if (!Y.getNode())
+        return SDValue();
+
+      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+      // Look through mask bitcast.
+      if (Mask.getOpcode() == ISD::BITCAST)
+        Mask = Mask.getOperand(0);
+      if (X.getOpcode() == ISD::BITCAST)
+        X = X.getOperand(0);
+      if (Y.getOpcode() == ISD::BITCAST)
+        Y = Y.getOperand(0);
+
+      EVT MaskVT = Mask.getValueType();
+
+      // Validate that the Mask operand is a vector sra node.
+      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+      // there is no psrai.b
+      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+      unsigned SraAmt = ~0;
+      if (Mask.getOpcode() == ISD::SRA) {
+        SDValue Amt = Mask.getOperand(1);
+        if (isSplatVector(Amt.getNode())) {
+          SDValue SclrAmt = Amt->getOperand(0);
+          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
+            SraAmt = C->getZExtValue();
+        }
+      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+        SDValue SraC = Mask.getOperand(1);
+        SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+      }
+      if ((SraAmt + 1) != EltBits)
+        return SDValue();
+
+      DebugLoc DL = N->getDebugLoc();
+
+      // Now we know we at least have a pblendvb with the mask val. See if
+      // we can form a psignb/w/d.
+      // psign = x.type == y.type == mask.type && y = sub(0, x);
+      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+               "Unsupported VT for PSIGN");
+        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+      }
+      // PBLENDVB is only available on SSE 4.1.
+      if (!Subtarget->hasSSE41())
+        return SDValue();
+
+      EVT BlendVT = (VT == MVT::v4i64) ?
MVT::v32i8 : MVT::v16i8; + + X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); + Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); + Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); + } + } + + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) + if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) + std::swap(N0, N1); + if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) + return SDValue(); + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + SDValue ShAmt0 = N0.getOperand(1); + if (ShAmt0.getValueType() != MVT::i8) + return SDValue(); + SDValue ShAmt1 = N1.getOperand(1); + if (ShAmt1.getValueType() != MVT::i8) + return SDValue(); + if (ShAmt0.getOpcode() == ISD::TRUNCATE) + ShAmt0 = ShAmt0.getOperand(0); + if (ShAmt1.getOpcode() == ISD::TRUNCATE) + ShAmt1 = ShAmt1.getOperand(0); + + DebugLoc DL = N->getDebugLoc(); + unsigned Opc = X86ISD::SHLD; + SDValue Op0 = N0.getOperand(0); + SDValue Op1 = N1.getOperand(0); + if (ShAmt0.getOpcode() == ISD::SUB) { + Opc = X86ISD::SHRD; + std::swap(Op0, Op1); + std::swap(ShAmt0, ShAmt1); + } + + unsigned Bits = VT.getSizeInBits(); + if (ShAmt1.getOpcode() == ISD::SUB) { + SDValue Sum = ShAmt1.getOperand(0); + if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { + SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) + return DAG.getNode(Opc, DL, VT, + Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { + ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); + if (ShAmt0C && + ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) + return DAG.getNode(Opc, DL, VT, + N0.getOperand(0), N1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + + return SDValue(); +} + +// Generate NEG and CMOV for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + if (VT.isInteger() && VT.getSizeInBits() == 8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CMOV. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && + N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && + N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { + // Generate SUB & CMOV. 
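+        // Sanity check with X = -5: Y = X >> 31 = -1, X + Y = -6, and
+        // -6 ^ -1 = 5, so the pattern computes abs(X). The NEG below sets
+        // the flags and CMOV picks whichever of X and 0-X is non-negative.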
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, VT), N0.getOperand(0)); + + SDValue Ops[] = { N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, MVT::i8), + SDValue(Neg.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), + Ops, array_lengthof(Ops)); + } + return SDValue(); +} + +// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (Subtarget->hasCMov()) { + SDValue RV = performIntegerAbsCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Try forming BMI if it is available. + if (!Subtarget->hasBMI()) + return SDValue(); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); + + // Create BLSMSK instructions by finding X ^ (X-1) + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); + + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); + + return SDValue(); +} + +/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. +static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + LoadSDNode *Ld = cast<LoadSDNode>(N); + EVT RegVT = Ld->getValueType(0); + EVT MemVT = Ld->getMemoryVT(); + DebugLoc dl = Ld->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + // On Sandybridge unaligned 256bit loads are inefficient. + ISD::LoadExtType Ext = Ld->getExtensionType(); + unsigned Alignment = Ld->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; + if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + unsigned NumElems = RegVT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + NumElems/2); + SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Alignment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + std::min(16U, Alignment)); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load1.getValue(1), + Load2.getValue(1)); + + SDValue NewVec = DAG.getUNDEF(RegVT); + NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + return DCI.CombineTo(N, NewVec, TF, true); + } + + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. 
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and an arithmetic shift.
+  // TODO: It is possible to support ZExt by zeroing the undef values
+  // during the shuffle phase or after the shuffle.
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
+    assert(MemVT != RegVT && "Cannot extend to the same type");
+    assert(MemVT.isVector() && "Must load a vector from memory");
+
+    unsigned NumElems = RegVT.getVectorNumElements();
+    unsigned MemSz = MemVT.getSizeInBits();
+    assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
+
+    // All sizes must be a power of two.
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+      return SDValue();
+
+    // Attempt to load the original value using scalar loads.
+    // Find the largest scalar type that divides the total loaded size.
+    MVT SclrLoadTy = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+        SclrLoadTy = Tp;
+      }
+    }
+
+    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
+    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+        (64 <= MemSz))
+      SclrLoadTy = MVT::f64;
+
+    // Calculate the number of scalar loads that we need to perform
+    // in order to load our vector from memory.
+    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegSize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegSize /= 2;
+
+    // Represent our vector as a sequence of elements which are the
+    // largest scalar that we can load.
+    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+                                         loadRegSize/SclrLoadTy.getSizeInBits());
+
+    // Represent the data using the same element type that is stored in
+    // memory. In practice, we 'widen' MemVT.
+    EVT WideVecVT =
+        EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                         loadRegSize/MemVT.getScalarType().getSizeInBits());
+
+    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+           "Invalid vector type");
+
+    // We can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+    for (unsigned i = 0; i < NumLoads; ++i) {
+      // Perform a single load.
+      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                       Ptr, Ld->getPointerInfo(),
+                                       Ld->isVolatile(), Ld->isNonTemporal(),
+                                       Ld->isInvariant(), Ld->getAlignment());
+      Chains.push_back(ScalarLoad.getValue(1));
+      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+      // another round of DAGCombining.
+ if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); + unsigned SizeRatio = RegSz/MemSz; + + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1 we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + return DCI.CombineTo(N, Sext, TF, true); + } + + // Otherwise we'll shuffle the small elements in the high bits of the + // larger type and perform an arithmetic shift. If the shift is not legal + // it's better to scalarize. + if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) + return SDValue(); + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i*SizeRatio + SizeRatio-1] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + + // Build the arithmetic shift. + unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - + MemVT.getVectorElementType().getSizeInBits(); + Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, + DAG.getConstant(Amt, RegVT)); + + return DCI.CombineTo(N, Shuff, TF, true); + } + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i*SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + // Bitcast to the requested type. + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + // Replace the original load with the new sequence + // and return the new chain. + return DCI.CombineTo(N, Shuff, TF, true); + } + + return SDValue(); +} + +/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. +static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + StoreSDNode *St = cast<StoreSDNode>(N); + EVT VT = St->getValue().getValueType(); + EVT StVT = St->getMemoryVT(); + DebugLoc dl = St->getDebugLoc(); + SDValue StoredVal = St->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we are saving a concatenation of two XMM registers, perform two stores. + // On Sandy Bridge, 256-bit memory operations are executed by two + // 128-bit ports. However, on Haswell it is better to issue a single 256-bit + // memory operation. 
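+  // For example, an unaligned v8f32 store is split below into two 16-byte
+  // stores at Ptr and Ptr+16, mirroring how Sandy Bridge executes a
+  // 256-bit store as two 128-bit operations.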
+ unsigned Alignment = St->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; + if (VT.is256BitVector() && !Subtarget->hasInt256() && + StVT == VT && !IsAligned) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); + + SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); + SDValue Ptr0 = St->getBasePtr(); + SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); + + SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(16U, Alignment)); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + } + + // Optimize trunc store (of multiple scalars) to shuffle and store. + // First, pack all of the elements in one place. Next, store to memory + // in fewer chunks. + if (St->isTruncatingStore() && VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromSz) % ToSz) return SDValue(); + + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) + StoreType = Tp; + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 
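+    // For instance, a v8i32 -> v8i8 truncating store shuffles the eight
+    // result bytes to the bottom of the register and then issues a single
+    // 8-byte store, typed f64 here when i64 is not a legal scalar type.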
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i)); + SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); + } + + // Turn load->store of MMX types into GPR load/stores. This avoids clobbering + // the FP state in cases where an emms may be missing. + // A preferable solution to the general problem is to figure out the right + // places to insert EMMS. This qualifies as a quick hack. + + // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. + if (VT.getSizeInBits() != 64) + return SDValue(); + + const Function *F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps + && Subtarget->hasSSE2(); + if ((VT.isVector() || + (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && + isa<LoadSDNode>(St->getValue()) && + !cast<LoadSDNode>(St->getValue())->isVolatile() && + St->getChain().hasOneUse() && !St->isVolatile()) { + SDNode* LdVal = St->getValue().getNode(); + LoadSDNode *Ld = 0; + int TokenFactorIndex = -1; + SmallVector<SDValue, 8> Ops; + SDNode* ChainVal = St->getChain().getNode(); + // Must be a store of a load. We currently handle two cases: the load + // is a direct child, and it's under an intervening TokenFactor. It is + // possible to dig deeper under nested TokenFactors. + if (ChainVal == LdVal) + Ld = cast<LoadSDNode>(St->getChain()); + else if (St->getValue().hasOneUse() && + ChainVal->getOpcode() == ISD::TokenFactor) { + for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { + if (ChainVal->getOperand(i).getNode() == LdVal) { + TokenFactorIndex = i; + Ld = cast<LoadSDNode>(St->getValue()); + } else + Ops.push_back(ChainVal->getOperand(i)); + } + } + + if (!Ld || !ISD::isNormalLoad(Ld)) + return SDValue(); + + // If this is not the MMX case, i.e. we are just turning i64 load/store + // into f64 load/store, avoid the transformation if there are multiple + // uses of the loaded value. + if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + return SDValue(); + + DebugLoc LdDL = Ld->getDebugLoc(); + DebugLoc StDL = N->getDebugLoc(); + // If we are a 64-bit capable x86, lower to a single movq load/store pair. + // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store + // pair instead. + if (Subtarget->is64Bit() || F64IsLegal) { + EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; + SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + SDValue NewChain = NewLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(NewChain); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], + Ops.size()); + } + return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + } + + // Otherwise, lower to two pairs of 32-bit loads / stores. + SDValue LoAddr = Ld->getBasePtr(); + SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, + DAG.getConstant(4, MVT::i32)); + + SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), + MinAlign(Ld->getAlignment(), 4)); + + SDValue NewChain = LoLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(LoLd); + Ops.push_back(HiLd); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], + Ops.size()); + } + + LoAddr = St->getBasePtr(); + HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, + DAG.getConstant(4, MVT::i32)); + + SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, + St->getPointerInfo().getWithOffset(4), + St->isVolatile(), + St->isNonTemporal(), + MinAlign(St->getAlignment(), 4)); + return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); + } + return SDValue(); +} + +/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" +/// and return the operands for the horizontal operation in LHS and RHS. A +/// horizontal operation performs the binary operation on successive elements +/// of its first operand, then on successive elements of its second operand, +/// returning the resulting values in a vector. For example, if +/// A = < float a0, float a1, float a2, float a3 > +/// and +/// B = < float b0, float b1, float b2, float b3 > +/// then the result of doing a horizontal operation on A and B is +/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. +/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form +/// A horizontal-op B, for some already available A and B, and if so then LHS is +/// set to A, RHS to B, and the routine returns 'true'. +/// Note that the binary operation should have the property that if one of the +/// operands is UNDEF then the result is UNDEF. +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { + // Look for the following pattern: if + // A = < float a0, float a1, float a2, float a3 > + // B = < float b0, float b1, float b2, float b3 > + // and + // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> + // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> + // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > + // which is A horizontal-op B. + + // At least one of the operands should be a vector shuffle. 
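+  // Worked example: with A = <1, 2, 3, 4> and B = <5, 6, 7, 8>,
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> = <1, 3, 5, 7>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> = <2, 4, 6, 8>
+  // and LHS + RHS = <3, 7, 11, 15>, which is exactly haddps A, B.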
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && + RHS.getOpcode() != ISD::VECTOR_SHUFFLE) + return false; + + EVT VT = LHS.getValueType(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; + + // View LHS in the form + // LHS = VECTOR_SHUFFLE A, B, LMask + // If LHS is not a shuffle then pretend it is the shuffle + // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> + // NOTE: in what follows a default initialized SDValue represents an UNDEF of + // type VT. + SDValue A, B; + SmallVector<int, 16> LMask(NumElts); + if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) + A = LHS.getOperand(0); + if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) + B = LHS.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), LMask.begin()); + } else { + if (LHS.getOpcode() != ISD::UNDEF) + A = LHS; + for (unsigned i = 0; i != NumElts; ++i) + LMask[i] = i; + } + + // Likewise, view RHS in the form + // RHS = VECTOR_SHUFFLE C, D, RMask + SDValue C, D; + SmallVector<int, 16> RMask(NumElts); + if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) + C = RHS.getOperand(0); + if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) + D = RHS.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), RMask.begin()); + } else { + if (RHS.getOpcode() != ISD::UNDEF) + C = RHS; + for (unsigned i = 0; i != NumElts; ++i) + RMask[i] = i; + } + + // Check that the shuffles are both shuffling the same vectors. + if (!(A == C && B == D) && !(A == D && B == C)) + return false; + + // If everything is UNDEF then bail out: it would be better to fold to UNDEF. + if (!A.getNode() && !B.getNode()) + return false; + + // If A and B occur in reverse order in RHS, then "swap" them (which means + // rewriting the mask). + if (A != C) + CommuteVectorShuffleMask(RMask, NumElts); + + // At this point LHS and RHS are equivalent to + // LHS = VECTOR_SHUFFLE A, B, LMask + // RHS = VECTOR_SHUFFLE A, B, RMask + // Check that the masks correspond to performing a horizontal operation. + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; + + // Ignore any UNDEF components. + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) + continue; + + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) + return false; + } + + LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. + RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
+ return true; +} + +/// PerformFADDCombine - Do target-specific dag combines on floating point adds. +static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, true)) + return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); + return SDValue(); +} + +/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. +static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal subs from subs of shuffles. + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, false)) + return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); + return SDValue(); +} + +/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and +/// X86ISD::FXOR nodes. +static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); + // F[X]OR(0.0, x) -> x + // F[X]OR(x, 0.0) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + return SDValue(); +} + +/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and +/// X86ISD::FMAX nodes. +static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); + + // Only perform optimizations if UnsafeMath is used. + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes + // into FMINC and FMAXC, which are Commutative operations. + unsigned NewOp = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unknown opcode"); + case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; + case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; + } + + return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); +} + +/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { + // FAND(0.0, x) -> 0.0 + // FAND(x, 0.0) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + return SDValue(); +} + +static SDValue PerformBTCombine(SDNode *N, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // BT ignores high bits in the bit index operand. 
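+  // For example, a register-form BT on an i32 operand uses the bit index
+  // modulo 32, so an index of 35 tests bit 3; only the low log2(width)
+  // bits of the index are demanded below.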
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.hasOneUse()) {
+    unsigned BitWidth = Op1.getValueSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+  return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() ==
+      OpVT.getVectorElementType().getSizeInBits()) {
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+  }
+  return SDValue();
+}
+
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+                                               const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+  DebugLoc dl = N->getDebugLoc();
+
+  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
+  // AVX2, since there is no sign-extended shift right operation on a vector
+  // with 64-bit elements.
+  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
+  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
+  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND)) {
+    SDValue N00 = N0.getOperand(0);
+
+    // EXTLOAD has a better solution on AVX2: it may be replaced with an
+    // X86ISD::VSEXT node.
+    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+      if (!ISD::isNormalLoad(N00.getNode()))
+        return SDValue();
+
+    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+                                N00, N1);
+      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+    }
+  }
+  return SDValue();
+}
+
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasFp256())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT.isVector() && VT.getSizeInBits() == 256) {
+    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (R.getNode())
+      return R;
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget* Subtarget) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  EVT ScalarVT = VT.getScalarType();
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
+    return SDValue();
+
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+  SDValue C = N->getOperand(2);
+
+  bool NegA = (A.getOpcode() == ISD::FNEG);
+  bool NegB = (B.getOpcode() == ISD::FNEG);
+  bool NegC = (C.getOpcode() == ISD::FNEG);
+
+  // The multiplication is negated when exactly one of the multiplicands is
+  // negated (NegA xor NegB).
+  bool NegMul = (NegA != NegB);
+  if (NegA)
+    A = A.getOperand(0);
+  if (NegB)
+    B = B.getOperand(0);
+  if (NegC)
+    C = C.getOperand(0);
+
+  unsigned Opcode;
+  if (!NegMul)
+    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+  else
+    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+  return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
+  // (and (i32 x86isd::setcc_carry), 1)
+  // This eliminates the zext. This transformation is necessary because
+  // ISD::SETCC is always legalized to i8.
+  DebugLoc dl = N->getDebugLoc();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() == ISD::AND &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (!C || C->getZExtValue() != 1)
+        return SDValue();
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, VT));
+    }
+  }
+
+  if (VT.is256BitVector()) {
+    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (R.getNode())
+      return R;
+  }
+
+  return SDValue();
+}
+
+// Optimize  x == -y --> x+y == 0
+//           x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+                                   LHS.getValueType(), RHS, LHS.getOperand(1));
+        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+                                   RHS.getValueType(), LHS, RHS.getOperand(1));
+        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+  return SDValue();
+}
+
+// Helper function of PerformSETCCCombine. It materializes "setb reg" as
+// "sbb reg,reg", since the latter can be extended without a zext and produces
+// an all-ones value which is more useful than 0/1 in some cases.
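+// Roughly, instead of
+//   setb %al
+// this emits the equivalent of
+//   sbb %eax, %eax          // 0 or 0xffffffff, depending on CF
+//   and $1, %al             // narrowed back to 0/1 where an i8 is needed
+// and the intermediate all-ones value can feed mask-style users directly.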
+static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+  return DAG.getNode(ISD::AND, DL, MVT::i8,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+                     DAG.getConstant(1, MVT::i8));
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because the Cmp
+    // instruction cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG);
+    }
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG);
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+                                        const X86TargetLowering *XTLI) {
+  SDValue Op0 = N->getOperand(0);
+  EVT InVT = Op0->getValueType(0);
+
+  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+  // SINT_TO_FP(v8i8) -> SINT_TO_FP(SEXT(v8i8 to v8i32))
+  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+    DebugLoc dl = N->getDebugLoc();
+    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+  }
+
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations.
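+  // Roughly: (f64 (sint_to_fp (i64 (load addr)))) becomes an X86ISD::FILD of
+  // addr, i.e. the x87 'fildll' instruction, so the illegal i64 value never
+  // has to live in 32-bit GPRs.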
+  if (Op0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+    EVT VT = Ld->getValueType(0);
+    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+        !XTLI->getSubtarget()->is64Bit() &&
+        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
+                                          Ld->getChain(), Op0, DAG);
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+      return FILDChain;
+    }
+  }
+  return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
+                                 X86TargetLowering::DAGCombinerInfo &DCI) {
+  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+  // the result is either zero or one (depending on the input carry bit).
+  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+  if (X86::isZeroNode(N->getOperand(0)) &&
+      X86::isZeroNode(N->getOperand(1)) &&
+      // We don't have a good way to replace an EFLAGS use, so only do this
+      // when the EFLAGS result is dead.
+      SDValue(N, 1).use_empty()) {
+    DebugLoc DL = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
+    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
+                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+                                           DAG.getConstant(X86::COND_B, MVT::i8),
+                                           N->getOperand(2)),
+                               DAG.getConstant(1, VT));
+    return DCI.CombineTo(N, Res1, CarryOut);
+  }
+
+  return SDValue();
+}
+
+// fold (add Y, (sete  X, 0)) -> adc  0, Y
+//      (add Y, (setne X, 0)) -> sbb -1, Y
+//      (sub (sete  X, 0), Y) -> sbb  0, Y
+//      (sub (setne X, 0), Y) -> adc -1, Y
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+  DebugLoc DL = N->getDebugLoc();
+
+  // Look through ZExts.
+  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
+  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+    return SDValue();
+
+  SDValue SetCC = Ext.getOperand(0);
+  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+    return SDValue();
+
+  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SDValue Cmp = SetCC.getOperand(1);
+  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+      !X86::isZeroNode(Cmp.getOperand(1)) ||
+      !Cmp.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue CmpOp0 = Cmp.getOperand(0);
+  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
+                               DAG.getConstant(1, CmpOp0.getValueType()));
+
+  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+  if (CC == X86::COND_NE)
+    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
+                       DL, OtherVal.getValueType(), OtherVal,
+                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
+  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
+                     DL, OtherVal.getValueType(), OtherVal,
+                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
+}
+
+/// PerformAddCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
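+  // e.g. for v4i32 with SSSE3:
+  //   (add (shuffle X, Y, <0, 2, 4, 6>), (shuffle X, Y, <1, 3, 5, 7>))
+  // becomes a single (X86ISD::HADD X, Y), i.e. one PHADDD.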
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // X86 can't encode an immediate LHS of a sub. See if we can push the
+  // negation into a preceding instruction.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+    // If the RHS of the sub is an XOR with one use and a constant, invert the
+    // immediate. Then add one to the LHS of the sub so we can turn
+    // X-Y -> X+~Y+1, saving one register.
+    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+        isa<ConstantSDNode>(Op1.getOperand(1))) {
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+      EVT VT = Op0.getValueType();
+      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+                                   Op1.getOperand(0),
+                                   DAG.getConstant(~XorC, VT));
+      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+                         DAG.getConstant(C->getAPIntValue()+1, VT));
+    }
+  }
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  EVT VT = N->getValueType(0);
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// performVZEXTCombine - Do target-specific dag combines on
+/// X86ISD::VZEXT nodes.
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  // (vzext (bitcast (vzext x))) -> (vzext x)
+  SDValue In = N->getOperand(0);
+  while (In.getOpcode() == ISD::BITCAST)
+    In = In.getOperand(0);
+
+  if (In.getOpcode() != X86ISD::VZEXT)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+                     In.getOperand(0));
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+  case ISD::VSELECT:
+  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
+  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
+  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
+  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
+  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
+  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
+  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
+  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case X86ISD::FXOR:
+  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
+  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
+  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI, Subtarget);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::PALIGNR:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
+  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  }
+
+  return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  if (!isTypeLegal(VT))
+    return false;
+  if (VT != MVT::i16)
+    return true;
+
+  switch (Opc) {
+  default:
+    return true;
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return false;
+  }
+}
+
+/// IsDesirableToPromoteOp - This method queries the target as to whether it
+/// is beneficial for the dag combiner to promote the specified node. If true,
+/// it should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  EVT VT = Op.getValueType();
+  if (VT != MVT::i16)
+    return false;
+
+  bool Promote = false;
+  bool Commute = false;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If the non-extending load has a single use and it's not live out, then it
+    // might be folded.
+    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+                                                     Op.hasOneUse()*/) {
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather than it being
+        // promoted as an operand) is when its only use is liveout.
+        if (UI->getOpcode() != ISD::CopyToReg)
+          return false;
+      }
+    }
+    Promote = true;
+    break;
+  }
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Promote = true;
+    break;
+  case ISD::SHL:
+  case ISD::SRL: {
+    SDValue N0 = Op.getOperand(0);
+    // Look out for (store (shl (load), x)).
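+    // If both the load and the store would fold into the i16 shift,
+    // promoting to i32 would lose that folding, so keep the type as is.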
+ if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) + return false; + Promote = true; + break; + } + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + Commute = true; + // fallthrough + case ISD::SUB: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (!Commute && MayFoldLoad(N1)) + return false; + // Avoid disabling potential load folding opportunities. + if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) + return false; + if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) + return false; + Promote = true; + } + } + + PVT = MVT::i32; + return Promote; +} + +//===----------------------------------------------------------------------===// +// X86 Inline Assembly Support +//===----------------------------------------------------------------------===// + +namespace { + // Helper to match a string separated by whitespace. + bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { + s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. + + for (unsigned i = 0, e = args.size(); i != e; ++i) { + StringRef piece(*args[i]); + if (!s.startswith(piece)) // Check if the piece matches. + return false; + + s = s.substr(piece.size()); + StringRef::size_type pos = s.find_first_not_of(" \t"); + if (pos == 0) // We matched a prefix. + return false; + + s = s.substr(pos); + } + + return s.empty(); + } + const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; +} + +bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + + std::string AsmStr = IA->getAsmString(); + + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + // FIXME: this should verify that we are targeting a 486 or better. If not, + // we will turn this bswap into something that will be lowered to logical + // ops instead of emitting the bswap asm. For now, we don't support 486 or + // lower so don't worry about this. + // bswap $0 + if (matchAsm(AsmPieces[0], "bswap", "$0") || + matchAsm(AsmPieces[0], "bswapl", "$0") || + matchAsm(AsmPieces[0], "bswapq", "$0") || + matchAsm(AsmPieces[0], "bswap", "${0:q}") || + matchAsm(AsmPieces[0], "bswapl", "${0:q}") || + matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { + // No need to check constraints, nothing other than the equivalent of + // "=r,0" would be valid here. 
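+      // For illustration, this matches source like
+      //   __asm__("bswap %0" : "=r"(x) : "0"(x));
+      // and replaces the whole asm with a call to the llvm.bswap intrinsic.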
+ return IntrinsicLowering::LowerToByteSwap(CI); + } + + // rorw $$8, ${0:w} --> llvm.bswap.i16 + if (CI->getType()->isIntegerTy(16) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || + matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); + } + break; + case 3: + if (CI->getType()->isIntegerTy(32) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && + matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && + matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); + } + + if (CI->getType()->isIntegerTy(64)) { + InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); + if (Constraints.size() >= 2 && + Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && + Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { + // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 + if (matchAsm(AsmPieces[0], "bswap", "%eax") && + matchAsm(AsmPieces[1], "bswap", "%edx") && + matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + return IntrinsicLowering::LowerToByteSwap(CI); + } + } + break; + } + return false; +} + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +X86TargetLowering::ConstraintType +X86TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'R': + case 'q': + case 'Q': + case 'f': + case 't': + case 'u': + case 'y': + case 'x': + case 'Y': + case 'l': + return C_RegisterClass; + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + return C_Register; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'G': + case 'C': + case 'e': + case 'Z': + return C_Other; + default: + break; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight + X86TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
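+  // e.g. for asm("add %1, %0" : "=a"(d) : "r"(s)) with integer operands, the
+  // 'a' alternative is scored CW_SpecificReg below, while 'r' is handled by
+  // the default case via the generic TargetLowering implementation.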
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':
+    if (type->isFloatingPointTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'y':
+    if (type->isX86_MMXTy() && Subtarget->hasMMX())
+      weight = CW_SpecificReg;
+    break;
+  case 'x':
+  case 'Y':
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+      weight = CW_Register;
+    break;
+  case 'I':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 31)
+        weight = CW_Constant;
+    }
+    break;
+  case 'J':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C':
+    if (isa<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+  // 'f' like normal targets.
+  if (ConstraintVT.isFloatingPoint()) {
+    if (Subtarget->hasSSE2())
+      return "Y";
+    if (Subtarget->hasSSE1())
+      return "x";
+  }
+
+  return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue> &Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only support length 1 constraints for now.
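+  // e.g. for asm("outb %0, %1" : : "a"(v), "N"(port)) with port = 0x80, the
+  // 'N' case below accepts the constant because it fits in an unsigned 8-bit
+  // immediate, and emits it as a target constant.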
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; + case 'I': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 31) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'J': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 63) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'K': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (isInt<8>(C->getSExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'N': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 255) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'e': { + // 32-bit signed value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getSExtValue())) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); + break; + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + } + return; + } + case 'Z': { + // 32-bit unsigned value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getZExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + return; + } + case 'i': { + // Literal immediates are always ok. + if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); + break; + } + + // In any sort of PIC mode addresses need to be computed at runtime by + // adding in a register or some sort of table lookup. These can't + // be used as immediates. + if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) + return; + + // If we are in non-pic codegen mode, we allow the address of a global (with + // an optional displacement) to be used with 'i'. + GlobalAddressSDNode *GA = 0; + int64_t Offset = 0; + + // Match either (GA), (GA+C), (GA+C1+C2), etc. + while (1) { + if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { + Offset += GA->getOffset(); + break; + } else if (Op.getOpcode() == ISD::ADD) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } else if (Op.getOpcode() == ISD::SUB) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += -C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } + + // Otherwise, this isn't something we can handle, reject it. + return; + } + + const GlobalValue *GV = GA->getGlobal(); + // If we require an extra load to get this address, as in PIC mode, we + // can't accept it. 
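+      // (A GOT- or stub-indirected reference is the typical case: the real
+      // address only materializes after the extra indirection load, so it is
+      // not usable as a compile-time immediate.)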
+ if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, + getTargetMachine()))) + return; + + Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + GA->getValueType(0), Offset); + break; + } + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +std::pair<unsigned, const TargetRegisterClass*> +X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + // First, see if this is a constraint that directly corresponds to an LLVM + // register class. + if (Constraint.size() == 1) { + // GCC Constraint Letters + switch (Constraint[0]) { + default: break; + // TODO: Slight differences here in allocation order and leaving + // RIP in the class. Do they matter any more here than they do + // in the normal allocation? + case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. + if (Subtarget->is64Bit()) { + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, &X86::GR64RegClass); + break; + } + // 32-bit fallthrough + case 'Q': // Q_REGS + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); + if (VT == MVT::i64) + return std::make_pair(0U, &X86::GR64_ABCDRegClass); + break; + case 'r': // GENERAL_REGS + case 'l': // INDEX_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) + return std::make_pair(0U, &X86::GR32RegClass); + return std::make_pair(0U, &X86::GR64RegClass); + case 'R': // LEGACY_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_NOREXRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_NOREXRegClass); + if (VT == MVT::i32 || !Subtarget->is64Bit()) + return std::make_pair(0U, &X86::GR32_NOREXRegClass); + return std::make_pair(0U, &X86::GR64_NOREXRegClass); + case 'f': // FP Stack registers. + // If SSE is enabled for this VT, use f80 to ensure the isel moves the + // value to the correct fpstack register class. + if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, &X86::RFP32RegClass); + if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, &X86::RFP64RegClass); + return std::make_pair(0U, &X86::RFP80RegClass); + case 'y': // MMX_REGS if MMX allowed. + if (!Subtarget->hasMMX()) break; + return std::make_pair(0U, &X86::VR64RegClass); + case 'Y': // SSE_REGS if SSE2 allowed + if (!Subtarget->hasSSE2()) break; + // FALL THROUGH. + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget->hasSSE1()) break; + + switch (VT.getSimpleVT().SimpleTy) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(0U, &X86::FR32RegClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(0U, &X86::FR64RegClass); + // Vector types. 
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(0U, &X86::VR128RegClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        return std::make_pair(0U, &X86::VR256RegClass);
+      }
+      break;
+    }
+  }
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass*> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (Res.second == 0) {
+    // Map st(0) .. st(7) to ST0 .. ST7.
+    if (Constraint.size() == 7 && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 's' &&
+        tolower(Constraint[2]) == 't' &&
+        Constraint[3] == '(' &&
+        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+        Constraint[5] == ')' &&
+        Constraint[6] == '}') {
+
+      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // GCC allows "st(0)" to be called just plain "st".
+    if (StringRef("{st}").equals_lower(Constraint)) {
+      Res.first = X86::ST0;
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // flags -> EFLAGS
+    if (StringRef("{flags}").equals_lower(Constraint)) {
+      Res.first = X86::EFLAGS;
+      Res.second = &X86::CCRRegClass;
+      return Res;
+    }
+
+    // 'A' means EAX + EDX.
+    if (Constraint == "A") {
+      Res.first = X86::EAX;
+      Res.second = &X86::GR32_ADRegClass;
+      return Res;
+    }
+    return Res;
+  }
+
+  // Otherwise, check to see if this is a register class of the wrong value
+  // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it
+  // to turn into {ax},{dx}.
+  if (Res.second->hasType(VT))
+    return Res;   // Correct type already, nothing to do.
+
+  // All of the single-register GCC register classes map their values onto
+  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
+  // really want an 8-bit or 32-bit register, map to the appropriate register
+  // class and return the appropriate register.
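+  // e.g. the constraint "{ax}" used with an i32 operand should yield EAX in
+  // GR32 below, not AX with a mismatched 16-bit register class.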
+ if (Res.second == &X86::GR16RegClass) { + if (VT == MVT::i8 || VT == MVT::i1) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::AL; break; + case X86::DX: DestReg = X86::DL; break; + case X86::CX: DestReg = X86::CL; break; + case X86::BX: DestReg = X86::BL; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = &X86::GR8RegClass; + } + } else if (VT == MVT::i32 || VT == MVT::f32) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::EAX; break; + case X86::DX: DestReg = X86::EDX; break; + case X86::CX: DestReg = X86::ECX; break; + case X86::BX: DestReg = X86::EBX; break; + case X86::SI: DestReg = X86::ESI; break; + case X86::DI: DestReg = X86::EDI; break; + case X86::BP: DestReg = X86::EBP; break; + case X86::SP: DestReg = X86::ESP; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = &X86::GR32RegClass; + } + } else if (VT == MVT::i64 || VT == MVT::f64) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::RAX; break; + case X86::DX: DestReg = X86::RDX; break; + case X86::CX: DestReg = X86::RCX; break; + case X86::BX: DestReg = X86::RBX; break; + case X86::SI: DestReg = X86::RSI; break; + case X86::DI: DestReg = X86::RDI; break; + case X86::BP: DestReg = X86::RBP; break; + case X86::SP: DestReg = X86::RSP; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = &X86::GR64RegClass; + } + } + } else if (Res.second == &X86::FR32RegClass || + Res.second == &X86::FR64RegClass || + Res.second == &X86::VR128RegClass) { + // Handle references to XMM physical registers that got mapped into the + // wrong class. This can happen with constraints like {xmm0} where the + // target independent register mapper will just pick the first match it can + // find, ignoring the required type. + + if (VT == MVT::f32 || VT == MVT::i32) + Res.second = &X86::FR32RegClass; + else if (VT == MVT::f64 || VT == MVT::i64) + Res.second = &X86::FR64RegClass; + else if (X86::VR128RegClass.hasType(VT)) + Res.second = &X86::VR128RegClass; + else if (X86::VR256RegClass.hasType(VT)) + Res.second = &X86::VR256RegClass; + } + + return Res; +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h new file mode 100644 index 0000000..2727e22 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -0,0 +1,957 @@ +//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef X86ISELLOWERING_H +#define X86ISELLOWERING_H + +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" + +namespace llvm { + namespace X86ISD { + // X86 Specific DAG Nodes + enum NodeType { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// BSF - Bit scan forward. 
+      /// BSR - Bit scan reverse.
+      BSF,
+      BSR,
+
+      /// SHLD, SHRD - Double shift instructions. These correspond to
+      /// X86::SHLDxx and X86::SHRDxx instructions.
+      SHLD,
+      SHRD,
+
+      /// FAND - Bitwise logical AND of floating point values. This corresponds
+      /// to X86::ANDPS or X86::ANDPD.
+      FAND,
+
+      /// FOR - Bitwise logical OR of floating point values. This corresponds
+      /// to X86::ORPS or X86::ORPD.
+      FOR,
+
+      /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+      /// to X86::XORPS or X86::XORPD.
+      FXOR,
+
+      /// FSRL - Bitwise logical right shift of floating point values. This
+      /// corresponds to X86::PSRLDQ.
+      FSRL,
+
+      /// CALL - These operations represent an abstract X86 call
+      /// instruction, which includes a bunch of information. In particular the
+      /// operands of these nodes are:
+      ///
+      ///     #0 - The incoming token chain
+      ///     #1 - The callee
+      ///     #2 - The number of arg bytes the caller pushes on the stack.
+      ///     #3 - The number of arg bytes the callee pops off the stack.
+      ///     #4 - The value to pass in AL/AX/EAX (optional)
+      ///     #5 - The value to pass in DL/DX/EDX (optional)
+      ///
+      /// The result values of these nodes are:
+      ///
+      ///     #0 - The outgoing token chain
+      ///     #1 - The first register result value (optional)
+      ///     #2 - The second register result value (optional)
+      ///
+      CALL,
+
+      /// RDTSC_DAG - This operation implements the lowering for
+      /// readcyclecounter
+      RDTSC_DAG,
+
+      /// X86 compare and logical compare instructions.
+      CMP, COMI, UCOMI,
+
+      /// X86 bit-test instructions.
+      BT,
+
+      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+      /// operand, usually produced by a CMP instruction.
+      SETCC,
+
+      // Same as SETCC except it's materialized with an sbb and the value is
+      // all ones or all zeros.
+      SETCC_CARRY,  // R = carry_bit ? ~0 : 0
+
+      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+      /// Operands are two FP values to compare; result is a mask of
+      /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+      FSETCCss, FSETCCsd,
+
+      /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+      /// result in an integer GPR. Needs masking for scalar result.
+      FGETSIGNx86,
+
+      /// X86 conditional moves. Operand 0 and operand 1 are the two values
+      /// to select from. Operand 2 is the condition code, and operand 3 is the
+      /// flag operand produced by a CMP or TEST instruction. It also writes a
+      /// flag result.
+      CMOV,
+
+      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// or TEST instruction.
+      BRCOND,
+
+      /// Return with a flag operand. Operand 0 is the chain operand, operand
+      /// 1 is the number of bytes of stack to pop.
+      RET_FLAG,
+
+      /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+      REP_STOS,
+
+      /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+      REP_MOVS,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// Wrapper - A wrapper node for TargetConstantPool,
+      /// TargetExternalSymbol, and TargetGlobalAddress.
+      Wrapper,
+
+      /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+      /// relative displacements.
+      WrapperRIP,
+
+      /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
+      /// to an MMX vector. If you think this is too close to the previous
+      /// mnemonic, so do I; blame Intel.
+      MOVDQ2Q,
+
+      /// MMX_MOVD2W - Copies a 32-bit value from the low word of an MMX
+      /// vector to a GPR.
+      MMX_MOVD2W,
+
+      /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRB.
+      PEXTRB,
+
+      /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRW.
+      PEXTRW,
+
+      /// INSERTPS - Insert any element of a 4 x float vector into any element
+      /// of a destination 4 x float vector.
+      INSERTPS,
+
+      /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRB.
+      PINSRB,
+
+      /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRW.
+      PINSRW, MMX_PINSRW,
+
+      /// PSHUFB - Shuffle 16 8-bit values within a vector.
+      PSHUFB,
+
+      /// ANDNP - Bitwise Logical AND NOT of Packed FP values.
+      ANDNP,
+
+      /// PSIGN - Copy integer sign.
+      PSIGN,
+
+      /// BLENDV - Blend where the selector is a register.
+      BLENDV,
+
+      /// BLENDI - Blend where the selector is an immediate.
+      BLENDI,
+
+      // SUBUS - Integer sub with unsigned saturation.
+      SUBUS,
+
+      /// HADD - Integer horizontal add.
+      HADD,
+
+      /// HSUB - Integer horizontal sub.
+      HSUB,
+
+      /// FHADD - Floating point horizontal add.
+      FHADD,
+
+      /// FHSUB - Floating point horizontal sub.
+      FHSUB,
+
+      /// UMAX, UMIN - Unsigned integer max and min.
+      UMAX, UMIN,
+
+      /// SMAX, SMIN - Signed integer max and min.
+      SMAX, SMIN,
+
+      /// FMAX, FMIN - Floating point max and min.
+      ///
+      FMAX, FMIN,
+
+      /// FMAXC, FMINC - Commutative FMIN and FMAX.
+      FMAXC, FMINC,
+
+      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+      /// approximation. Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+
+      // TLSADDR - Thread Local Storage.
+      TLSADDR,
+
+      // TLSBASEADDR - Thread Local Storage. A call to get the start address
+      // of the TLS block for the current module.
+      TLSBASEADDR,
+
+      // TLSCALL - Thread Local Storage. When calling to an OS provided
+      // thunk at the address from an earlier relocation.
+      TLSCALL,
+
+      // EH_RETURN - Exception Handling helpers.
+      EH_RETURN,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for
+      /// the list of operands.
+      TC_RETURN,
+
+      // VZEXT_MOVL - Vector move low and zero extend.
+      VZEXT_MOVL,
+
+      // VSEXT_MOVL - Vector move low and sign extend.
+      VSEXT_MOVL,
+
+      // VZEXT - Vector integer zero-extend.
+      VZEXT,
+
+      // VSEXT - Vector integer signed-extend.
+      VSEXT,
+
+      // VFPEXT - Vector FP extend.
+      VFPEXT,
+
+      // VFPROUND - Vector FP round.
+      VFPROUND,
+
+      // VSHLDQ, VSRLDQ - 128-bit vector logical left / right shift.
+      VSHLDQ, VSRLDQ,
+
+      // VSHL, VSRL, VSRA - Vector shift elements.
+      VSHL, VSRL, VSRA,
+
+      // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate.
+      VSHLI, VSRLI, VSRAI,
+
+      // CMPP - Vector packed double/float comparison.
+      CMPP,
+
+      // PCMP* - Vector integer comparisons.
+      PCMPEQ, PCMPGT,
+
+      // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
+      ADD, SUB, ADC, SBB, SMUL,
+      INC, DEC, OR, XOR, AND,
+
+      BLSI,   // BLSI - Extract lowest set isolated bit
+      BLSMSK, // BLSMSK - Get mask up to lowest set bit
+      BLSR,   // BLSR - Reset lowest set bit
+
+      UMUL, // LOW, HI, FLAGS = umul LHS, RHS
+
+      // MUL_IMM - X86 specific multiply by immediate.
+      MUL_IMM,
+
+      // PTEST - Vector bitwise comparisons.
+      PTEST,
+
+      // TESTP - Vector packed fp sign bitwise comparisons.
+      TESTP,
+
+      // Several flavors of instructions with vector shuffle behaviors.
+      PALIGNR,
+      PSHUFD,
+      PSHUFHW,
+      PSHUFLW,
+      SHUFP,
+      MOVDDUP,
+      MOVSHDUP,
+      MOVSLDUP,
+      MOVLHPS,
+      MOVLHPD,
+      MOVHLPS,
+      MOVLPS,
+      MOVLPD,
+      MOVSD,
+      MOVSS,
+      UNPCKL,
+      UNPCKH,
+      VPERMILP,
+      VPERMV,
+      VPERMI,
+      VPERM2X128,
+      VBROADCAST,
+
+      // PMULUDQ - Vector multiply packed unsigned doubleword integers.
+      PMULUDQ,
+
+      // FMA nodes.
+      FMADD,
+      FNMADD,
+      FMSUB,
+      FNMSUB,
+      FMADDSUB,
+      FMSUBADD,
+
+      // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+      // according to %al. An operator is needed so that this can be expanded
+      // with control flow.
+      VASTART_SAVE_XMM_REGS,
+
+      // WIN_ALLOCA - Windows's _chkstk call to do stack probing.
+      WIN_ALLOCA,
+
+      // SEG_ALLOCA - For allocating variable amounts of stack space when using
+      // segmented stacks. Checks if the current stacklet has enough space, and
+      // falls back to heap allocation if not.
+      SEG_ALLOCA,
+
+      // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui.
+      WIN_FTOL,
+
+      // Memory barrier.
+      MEMBARRIER,
+      MFENCE,
+      SFENCE,
+      LFENCE,
+
+      // FNSTSW16r - Store FP status word into i16 register.
+      FNSTSW16r,
+
+      // SAHF - Store contents of %ah into %eflags.
+      SAHF,
+
+      // RDRAND - Get a random integer and indicate whether it is valid in CF.
+      RDRAND,
+
+      // RDSEED - Get a NIST SP800-90B & C compliant random integer and
+      // indicate whether it is valid in CF.
+      RDSEED,
+
+      // PCMP*STRI
+      PCMPISTRI,
+      PCMPESTRI,
+
+      // XTEST - Test if in transactional execution.
+      XTEST,
+
+      // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+      // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+      // Atomic 64-bit binary operations.
+      ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      ATOMSUB64_DAG,
+      ATOMOR64_DAG,
+      ATOMXOR64_DAG,
+      ATOMAND64_DAG,
+      ATOMNAND64_DAG,
+      ATOMMAX64_DAG,
+      ATOMMIN64_DAG,
+      ATOMUMAX64_DAG,
+      ATOMUMIN64_DAG,
+      ATOMSWAP64_DAG,
+
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
+      LCMPXCHG_DAG,
+      LCMPXCHG8_DAG,
+      LCMPXCHG16_DAG,
+
+      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+      VZEXT_LOAD,
+
+      // FNSTCW16m - Store FP control word into i16 memory.
+      FNSTCW16m,
+
+      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+      /// integer destination in memory and a FP reg source. This corresponds
+      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+      /// has two inputs (token chain and address) and two outputs (int value
+      /// and token chain).
+      FP_TO_INT16_IN_MEM,
+      FP_TO_INT32_IN_MEM,
+      FP_TO_INT64_IN_MEM,
+
+      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+      /// integer source in memory and FP reg result. This corresponds to the
+      /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag.
+      FILD,
+      FILD_FLAG,
+
+      /// FLD - This instruction implements an extending load to FP stack slots.
+      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+      /// operand, ptr to load from, and a ValueType node indicating the type
+      /// to load to.
+      FLD,
+
+      /// FST - This instruction implements a truncating store to FP stack
+      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+      /// chain operand, value to store, address, and a ValueType to store it
+      /// as.
+      FST,
+
+      /// VAARG_64 - This instruction grabs the address of the next argument
+      /// from a va_list. (reads and modifies the va_list in memory)
+      VAARG_64
+
+      // WARNING: Do not add anything at the end unless you want the node to
+      // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be
+      // treated as target memory ops!
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace X86 {
+    /// isVEXTRACTF128Index - Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF128.
+    bool isVEXTRACTF128Index(SDNode *N);
+
+    /// isVINSERTF128Index - Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF128.
+    bool isVINSERTF128Index(SDNode *N);
+
+    /// getExtractVEXTRACTF128Immediate - Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF128 instructions.
+    unsigned getExtractVEXTRACTF128Immediate(SDNode *N);
+
+    /// getInsertVINSERTF128Immediate - Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF128 instructions.
+    unsigned getInsertVINSERTF128Immediate(SDNode *N);
+
+    /// isZeroNode - Returns true if Elt is a constant zero or a floating point
+    /// constant +0.0.
+    bool isZeroNode(SDValue Elt);
+
+    /// isOffsetSuitableForCodeModel - Returns true if the given offset can
+    /// fit into the displacement field of the instruction.
+    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                      bool hasSymbolicDisplacement = true);
+
+
+    /// isCalleePop - Determines whether the callee is required to pop its
+    /// own arguments. Callee pop is necessary to support tail calls.
+    bool isCalleePop(CallingConv::ID CallingConv,
+                     bool is64Bit, bool IsVarArg, bool TailCallOpt);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  class X86TargetLowering : public TargetLowering {
+  public:
+    explicit X86TargetLowering(X86TargetMachine &TM);
+
+    virtual unsigned getJumpTableEncoding() const;
+
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+
+    virtual const MCExpr *
+    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                              const MachineBasicBlock *MBB, unsigned uid,
+                              MCContext &Ctx) const;
+
+    /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+    /// jumptable.
+    virtual SDValue getPICJumpTableRelocBase(SDValue Table,
+                                             SelectionDAG &DAG) const;
+    virtual const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contain SSE vectors are placed at 16-byte boundaries while the
+    /// rest are at 4-byte boundaries.
+    virtual unsigned getByValTypeAlignment(Type *Ty) const;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero, that means the destination alignment can
+    /// satisfy any constraint. Similarly if SrcAlign is zero it means there
+    /// isn't a need to check it against the alignment requirement, probably
+    /// because the source does not need to be loaded. If 'IsMemset' is true,
+    /// that means it's expanding a memset. If 'ZeroMemset' is true, that means
+    /// it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    virtual EVT
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                        MachineFunction &MF) const;
+
+    /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    virtual bool isSafeMemOpType(MVT VT) const;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type. Returns whether it
+    /// is "fast" by reference in the second argument.
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG) const;
+
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    /// isTypeDesirableForOp - Return true if the target has native support for
+    /// the specified value type and it is 'desirable' to use the type for the
+    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+    /// instruction encodings are longer and some i16 instructions are slow.
+    virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const;
+
+    /// IsDesirableToPromoteOp - Return true if it is beneficial for the dag
+    /// combiner to promote the specified node. If true, the desired promotion
+    /// type is returned by reference.
+    virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    virtual EVT getSetCCResultType(EVT VT) const;
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
+    // operation that are sign bits.
+
+    /// ComputeNumSignBitsForTargetNode - Determine the number of bits in the
+    /// operation that are sign bits.
+    virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                     unsigned Depth) const;
+
+    virtual bool
+    isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
+
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    virtual ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+    virtual const char *LowerXConstraint(EVT ConstraintVT) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraints of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              std::string &Constraint,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register. This should only be used for C_Register constraints. On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is a
+    /// legal icmp immediate, that is, the target has icmp instructions which
+    /// can compare a register against the immediate without having to
+    /// materialize the immediate into a register.
+    virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is a
+    /// legal add immediate, that is, the target has add instructions which
+    /// can add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalAddImmediate(int64_t Imm) const;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+    /// isZExtFree - Return true if any actual instruction that defines a
+    /// value of type Ty1 implicitly zero-extends the value to Ty2 in the
+    /// result register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicitly zero-extend the
+    /// result out to 64 bits.
+    virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster
+    /// than a pair of mul and add instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true (and FMAs are legal),
+    /// otherwise fmuladd is expanded to mul + add.
+    virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+
+    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    /// isShuffleMaskLegal - Targets can use this to indicate that they only
+    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+    /// values are assumed to be legal.
+    virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                                    EVT VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can use
+    /// this to indicate if there is a suitable VECTOR_SHUFFLE that can be used
+    /// to replace a VAND with a constant pool entry.
+    virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                        EVT VT) const;
+
+    /// ShouldShrinkFPConstant - If true, then instruction selection should
+    /// seek to shrink the FP constant of the specified type to a smaller type
+    /// in order to save space and / or reduce runtime.
+    virtual bool ShouldShrinkFPConstant(EVT VT) const {
+      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+      // expensive than a straight movsd. On the other hand, it's important to
+      // shrink long double fp constant since fldt is very slow.
+      return !X86ScalarSSEf64 || VT == MVT::f80;
+    }
+
+    const X86Subtarget* getSubtarget() const {
+      return Subtarget;
+    }
+
+    /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+    /// computed in an SSE register, not on the X87 floating point stack.
+    bool isScalarFPTypeInSSEReg(EVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 when SSE2 is available
+             (VT == MVT::f32 && X86ScalarSSEf32);   // f32 when SSE1 is available
+    }
+
+    /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
+    /// for fptoui.
+    bool isTargetFTOL() const {
+      return Subtarget->isTargetWindows() && !Subtarget->is64Bit();
+    }
+
+    /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
+    /// used for fptoui to the given type.
+    bool isIntegerTypeFTOL(EVT VT) const {
+      return isTargetFTOL() && VT == MVT::i64;
+    }
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                                     const TargetLibraryInfo *libInfo) const;
+
+    /// getStackCookieLocation - Return true if the target stores stack
+    /// protector cookies at a fixed offset in some non-standard address
+    /// space, and populates the address space and offset as
+    /// appropriate.
+    virtual bool getStackCookieLocation(unsigned &AddressSpace,
+                                        unsigned &Offset) const;
+
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG) const;
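The isZExtFree overloads above rest on an architectural fact worth spelling out: on x86-64, any instruction that writes a 32-bit register implicitly zeroes bits 63:32, so an i32-to-i64 zero extension costs no extra instruction. A self-contained demonstration (hypothetical helper, not LLVM code):

    #include <cstdint>
    // Compiles to a plain 32-bit register move on x86-64; no explicit
    // zero-extension instruction is emitted.
    uint64_t widen(uint32_t X) { return X; }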
+
+    /// \brief Reset the operation actions based on target options.
+    virtual void resetOperationActions();
+
+  protected:
+    std::pair<const TargetRegisterClass*, uint8_t>
+    findRepresentativeClass(MVT VT) const;
+
+  private:
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+    const X86RegisterInfo *RegInfo;
+    const DataLayout *TD;
+
+    /// Used to store the TargetOptions so that we don't waste time resetting
+    /// the operation actions unless we have to.
+    TargetOptions TO;
+
+    /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+    /// floating point ops.
+    /// When SSE is available, use it for f32 operations.
+    /// When SSE2 is available, use it for f64 operations.
+    bool X86ScalarSSEf32;
+    bool X86ScalarSSEf64;
+
+    /// LegalFPImmediates - A list of legal fp immediates.
+    std::vector<APFloat> LegalFPImmediates;
+
+    /// addLegalFPImmediate - Indicate that this x86 target can instruction
+    /// select the specified FP immediate natively.
+    void addLegalFPImmediate(const APFloat& Imm) {
+      LegalFPImmediates.push_back(Imm);
+    }
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerMemArgument(SDValue Chain,
+                             CallingConv::ID CallConv,
+                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA, MachineFrameInfo *MFI,
+                             unsigned i) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
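The LegalFPImmediates vector and addLegalFPImmediate above feed the isFPImmLegal query declared earlier. A simplified sketch of the scan such a query performs (the real method lives in X86ISelLowering.cpp and may differ in detail):

    #include "llvm/ADT/APFloat.h"
    #include <vector>
    using llvm::APFloat;
    bool isLegalFPImm(const std::vector<APFloat> &Legal, const APFloat &Imm) {
      for (const APFloat &F : Legal)
        if (Imm.bitwiseIsEqual(F))   // exact bit-pattern match, not operator==
          return true;
      return false;
    }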
+
+    // Call lowering helpers.
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                           Type *RetTy,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+    bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
+    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+                                    SDValue Chain, bool IsTailCall, bool Is64Bit,
+                                    int FPDiff, DebugLoc dl) const;
+
+    unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+                                         SelectionDAG &DAG) const;
+
+    std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                               bool isSigned,
+                                               bool isReplace) const;
+
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+                               int64_t Offset, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
+    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToBT(SDValue And, ISD::CondCode CC,
+                      DebugLoc dl, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+    // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
+    SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
+    SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
+    SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    virtual SDValue
+      LowerCall(CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+
+    virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
+
+    virtual MVT
+    getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const;
+
+    virtual bool
+    CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                   bool isVarArg,
+                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                   LLVMContext &Context) const;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, max, min, umax, umin). It takes the corresponding instruction to
+    /// expand, the associated machine basic block, and the associated X86
+    /// opcodes for reg/reg.
+    MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
+    MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+                                               MachineBasicBlock *MBB) const;
+
+    // Utility function to emit the low-level va_arg code for X86-64.
+    MachineBasicBlock *EmitVAARG64WithCustomInserter(
+                       MachineInstr *MI,
+                       MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit the xmm reg save portion of va_start.
+    MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                   MachineInstr *BInstr,
+                                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+                                         MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
+                                            MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            bool Is64Bit) const;
+
+    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) const;
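EmitAtomicLoadArith above expands atomic read-modify-write pseudos; for operations x86 cannot express as a single locked instruction (nand, min, max, and the like), the expansion amounts to a compare-exchange loop. Its semantics, shown for nand in portable C++ (a sketch of the behavior, not the emitted machine code):

    #include <atomic>
    #include <cstdint>
    uint32_t atomic_nand(std::atomic<uint32_t> &Mem, uint32_t Val) {
      uint32_t Old = Mem.load();
      // Retry until no other thread modified Mem between the load and the CAS.
      while (!Mem.compare_exchange_weak(Old, ~(Old & Val))) {
      }
      return Old;
    }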
+
+    /// Emit nodes that will be selected as "test Op0,Op0", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
+
+    /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                    SelectionDAG &DAG) const;
+
+    /// Convert a comparison if required by the subtarget.
+    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+  };
+
+  namespace X86 {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif    // X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 0000000..ba1aede
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+      : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src1 = $dst";
+}
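The Has3DNow0F0FOpcode trait used by these classes reflects the unusual 3DNow! encoding: every instruction starts with the 0F 0F escape and carries its real opcode in a trailing suffix byte, which is why the opcode fields in the definitions below (0x9E for PFADD, 0xBF for PAVGUSB, and so on) land at the end of the instruction. A sketch of the byte layout (hypothetical encoder, register form only):

    #include <cstdint>
    #include <vector>
    std::vector<uint8_t> encode3DNow(uint8_t ModRM, uint8_t OpcodeSuffix) {
      // 0F 0F /r <suffix>, e.g. OpcodeSuffix = 0x9E encodes PFADD.
      return {0x0F, 0x0F, ModRM, OpcodeSuffix};
    }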
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+        (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+                   [(int_x86_mmx_femms)]>;
+
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+                      "prefetch\t$addr",
+                      [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+                  [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+                  Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
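For a concrete feel of what the intrinsic-backed definitions above compute, PAVGUSB is a rounding average of packed unsigned bytes. Per 8-bit lane, in plain C++ (illustration only):

    #include <cstdint>
    uint8_t pavgusb_lane(uint8_t A, uint8_t B) {
      return uint8_t((unsigned(A) + unsigned(B) + 1) >> 1);  // rounds up on ties
    }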
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 0000000..225e972
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1361 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let neverHasSideEffects = 1 in
+def LEA16r : I<0x8D, MRMSrcMem,
+               (outs GR16:$dst), (ins i32mem:$src),
+               "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+               (outs GR32:$dst), (ins i32mem:$src),
+               "lea{l}\t{$src|$dst}, {$dst|$src}",
+               [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+               Requires<[In32BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (outs GR32:$dst), (ins lea64_32mem:$src),
+                  "lea{l}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+                  Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r   : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+                  "lea{q}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instructions that load one value and get the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+  // Memory operand.
+  ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+  // Register reads (implicit or explicit).
+  ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r  : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src)),
+                (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_REG>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+               "mul{l}\t$src",
+               [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+               IIC_MUL32_REG>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+                "mul{q}\t$src",
+                [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+                IIC_MUL64>, Sched<[WriteIMul]>;
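The Defs/Uses lists on the MUL definitions above encode the ISA's implicit-register contract: the 8-bit form reads AL and writes the full 16-bit product to AX, and the wider forms write the double-width product to DX:AX, EDX:EAX, or RDX:RAX. For the 8-bit case (illustrative C++, not part of the TableGen definitions):

    #include <cstdint>
    // MUL r/m8: AX <- AL * src; the upper half of the product ends up in AH.
    uint16_t mul8(uint8_t AL, uint8_t Src) {
      return uint16_t(uint16_t(AL) * uint16_t(Src));
    }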
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+               "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src))),
+                (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_MEM>, OpSize, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+               "mul{l}\t$src",
+               [], IIC_MUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+                "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+}
+
+let neverHasSideEffects = 1 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r  : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
+                IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
+                IIC_IMUL16_RR>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
+                IIC_IMUL32_RR>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+                 IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m  : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+                "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+                "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize,
+                SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+                "imul{l}\t$src", [], IIC_IMUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+                 "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+}
+} // neverHasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+                 TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+                 TB;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+                                   (ins GR64:$src1, GR64:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+                  TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+                 (ins GR16:$src1, i16mem:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, (load addr:$src2)))],
+                       IIC_IMUL16_RM>,
+                 TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+                 (ins GR32:$src1, i32mem:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, (load addr:$src2)))],
+                       IIC_IMUL32_RM>,
+                 TB;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+                  (ins GR64:$src1, i64mem:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, (load addr:$src2)))],
+                        IIC_IMUL64_RM>,
+                  TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag GR16:$src1, imm:$src2))],
+                      IIC_IMUL16_RRI>, OpSize;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+                     IIC_IMUL16_RRI>,
+                 OpSize;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag GR32:$src1, imm:$src2))],
+                      IIC_IMUL32_RRI>;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+                     IIC_IMUL32_RRI>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                        (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                        "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                        [(set GR64:$dst, EFLAGS,
+                              (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+                        IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+                      IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
+                      (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                      IIC_IMUL16_RMI>,
+                 OpSize;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+                 OpSize;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                     // GR32 = [mem32]*I32
+                      (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                      IIC_IMUL32_RMI>;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i32immSExt8:$src2))],
+                     IIC_IMUL32_RMI>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                        (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+                        "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                        [(set GR64:$dst, EFLAGS,
+                              (X86smul_flag (load addr:$src1),
+                                            i64immSExt32:$src2))],
+                        IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1),
+                                          i64immSExt8:$src2))],
+                      IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+
+
+// unsigned division/remainder
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8r  : I<0xF6, MRM6r, (outs), (ins GR8:$src),    // AX/r8 = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "div{l}\t$src", [], IIC_DIV32>;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+                "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),  // AX/[mem8] = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_MEM>,
+               SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize,
+               SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+               "div{l}\t$src", [], IIC_DIV32>,
+               SchedLoadReg<WriteIDivLd>;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+                "div{q}\t$src", [], IIC_DIV64>,
+                SchedLoadReg<WriteIDivLd>;
+}
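The unsigned-division block above leans on the same implicit-register convention: DIV r/m32 divides the 64-bit value EDX:EAX, leaving the quotient in EAX and the remainder in EDX, and faults if the quotient overflows 32 bits. In C++ terms (a sketch, not the TableGen pattern):

    #include <cstdint>
    void div32(uint32_t EDX, uint32_t EAX, uint32_t Divisor,
               uint32_t &Quot, uint32_t &Rem) {
      uint64_t Dividend = (uint64_t(EDX) << 32) | EAX;
      Quot = uint32_t(Dividend / Divisor);  // hardware raises #DE on overflow
      Rem  = uint32_t(Dividend % Divisor);
    }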
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8r  : I<0xF6, MRM7r, (outs), (ins GR8:$src),    // AX/r8 = AL,AH
+                "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r : I<0xF7, MRM7r, (outs), (ins GR16:$src),   // DX:AX/r16 = AX,DX
+                "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r : I<0xF7, MRM7r, (outs), (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+                "idiv{l}\t$src", [], IIC_IDIV32>;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r : RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+                 "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8m  : I<0xF6, MRM7m, (outs), (ins i8mem:$src),  // AX/[mem8] = AL,AH
+                "idiv{b}\t$src", [], IIC_IDIV8>,
+                SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m : I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+                "idiv{w}\t$src", [], IIC_IDIV16>, OpSize,
+                SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m : I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+                "idiv{l}\t$src", [], IIC_IDIV32>,
+                SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in    // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m : RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+                 "idiv{q}\t$src", [], IIC_IDIV64>,
+                 SchedLoadReg<WriteIDivLd>;
+}
+} // hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "neg{b}\t$dst",
+               [(set GR8:$dst, (ineg GR8:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+               "neg{w}\t$dst",
+               [(set GR16:$dst, (ineg GR16:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+               "neg{l}\t$dst",
+               [(set GR32:$dst, (ineg GR32:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+                [(set GR64:$dst, (ineg GR64:$src1)),
+                 (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+               "neg{b}\t$dst",
+               [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+               "neg{w}\t$dst",
+               [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+               "neg{l}\t$dst",
+               [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+                 (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
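The NEG definitions above implement two's-complement negation, which is also why they set EFLAGS (NEG is defined as a subtraction from zero), unlike the NOT instructions that follow. The identity involved, in plain C++:

    #include <cstdint>
    uint32_t negate(uint32_t X) { return ~X + 1; }  // same bits as 0 - X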
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "not{b}\t$dst",
+               [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+               "not{w}\t$dst",
+               [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+               "not{l}\t$dst",
+               [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+                [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+               "not{b}\t$dst",
+               [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+               "not{w}\t$dst",
+               [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+               OpSize;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+               "not{l}\t$dst",
+               [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "inc{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+               "inc{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
+             OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               "inc{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+               IIC_UNARY_REG>,
+             Requires<[In32BitMode]>;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 1
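The In32BitMode requirement on the short INC/DEC register forms above exists because the one-byte opcodes 0x40..0x4F were repurposed as REX prefixes in 64-bit mode; the INC64_*/DEC64_* definitions below work around that with the two-byte 0xFF /0 and 0xFF /1 encodings. A one-line check in the spirit of a decoder (hypothetical helper):

    #include <cstdint>
    bool isRexPrefixByte(uint8_t B) { return (B & 0xF0) == 0x40; }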
+
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                  "inc{w}\t$dst",
+                  [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                  "inc{l}\t$dst",
+                  [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+                  IIC_UNARY_REG>,
+                Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                  "dec{w}\t$dst",
+                  [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                  "dec{l}\t$dst",
+                  [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+                  IIC_UNARY_REG>,
+                Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+                 [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+                  (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+                 [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                  (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize, Requires<[In32BitMode]>;
+  def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+                 [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                  (implicit EFLAGS)], IIC_UNARY_MEM>,
+               Requires<[In32BitMode]>;
+  def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+// FIXME: What is this for??
+def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+                  [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+                  [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>,
+                Requires<[In64BitMode]>;
+def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+                  [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+                  [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>,
+                Requires<[In64BitMode]>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "dec{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+               "dec{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+               IIC_UNARY_REG>,
+             OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               "dec{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+               IIC_UNARY_REG>,
+             Requires<[In32BitMode]>;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // CodeSize = 2
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+                 [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+                  (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+                 [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                  (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize, Requires<[In32BitMode]>;
+  def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+                 [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                  (implicit EFLAGS)], IIC_UNARY_MEM>,
+               Requires<[In32BitMode]>;
+  def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types. For example, it can tell you which
+/// register class and preferred load node to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+                  PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+                  Operand immoperand, SDPatternOperator immoperator,
+                  Operand imm8operand, SDPatternOperator imm8operator,
+                  bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> {
+  /// VT - This is the value type itself.
+  ValueType VT = vt;
+
+  /// InstrSuffix - This is the suffix used on instructions with this type. For
+  /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+  string InstrSuffix = instrsuffix;
+
+  /// RegClass - This is the register class associated with this type. For
+  /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+  RegisterClass RegClass = regclass;
+
+  /// LoadNode - This is the load node associated with this type. For
+  /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+  PatFrag LoadNode = loadnode;
+
+  /// MemOperand - This is the memory operand associated with this type. For
+  /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+  X86MemOperand MemOperand = memoperand;
+
+  /// ImmEncoding - This is the encoding of an immediate of this type. For
+  /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32
+  /// since the immediate field of i64 instructions is a 32-bit sign-extended
+  /// value.
+  ImmType ImmEncoding = immkind;
+
+  /// ImmOperand - This is the operand kind of an immediate of this type. For
+  /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 ->
+  /// i64i32imm since the immediate field of i64 instructions is a 32-bit
+  /// sign-extended value.
+  Operand ImmOperand = immoperand;
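The X86TypeInfo record above is effectively a per-width traits table that the instruction classes further down consult. A rough C++ analogue (hypothetical names, trimmed to a few fields):

    struct XTypeTraits {
      char Suffix;          // AT&T mnemonic suffix: 'b', 'w', 'l', 'q'
      bool HasOddOpcode;    // low opcode bit: 0 for i8 ops, 1 otherwise
      bool HasOpSizePrefix; // 0x66 prefix, i16 only
      bool HasREX_W;        // REX.W prefix, i64 only
    };
    constexpr XTypeTraits Xi8T  = {'b', false, false, false};
    constexpr XTypeTraits Xi16T = {'w', true,  true,  false};
    constexpr XTypeTraits Xi32T = {'l', true,  false, false};
    constexpr XTypeTraits Xi64T = {'q', true,  false, true};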
+
+  /// ImmOperator - This is the operator that should be used to match an
+  /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+  SDPatternOperator ImmOperator = immoperator;
+
+  /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+  /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is
+  /// only used for instructions that have a sign-extended imm8 field form.
+  Operand Imm8Operand = imm8operand;
+
+  /// Imm8Operator - This is the operator that should be used to match an 8-bit
+  /// sign-extended immediate of this kind in a pattern (e.g. i16immSExt8).
+  SDPatternOperator Imm8Operator = imm8operator;
+
+  /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+  /// opposed to even) opcode. Operations on i8 are usually even, operations on
+  /// other datatypes are odd.
+  bit HasOddOpcode = hasOddOpcode;
+
+  /// HasOpSizePrefix - This bit is set to true if the instruction should have
+  /// the 0x66 operand size prefix. This is set for i16 types.
+  bit HasOpSizePrefix = hasOpSizePrefix;
+
+  /// HasREX_WPrefix - This bit is set to true if the instruction should have
+  /// the 0x40 REX prefix. This is set for i64 types.
+  bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8  : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
+                       Imm8 , i8imm ,    imm,          i8imm   , invalid_node,
+                       0, 0, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+                       Imm16, i16imm,    imm,          i16i8imm, i16immSExt8,
+                       1, 1, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+                       Imm32, i32imm,    imm,          i32i8imm, i32immSExt8,
+                       1, 0, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+                       Imm32, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+                       1, 0, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+///    suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+///    or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+          string mnemonic, string args, list<dag> pattern,
+          InstrItinClass itin = IIC_BIN_NONMEM>
+  : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+       opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+      f, outs, ins,
+      !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+      itin> {
+
+  // Infer instruction prefixes from type info.
+  let hasOpSizePrefix = typeinfo.HasOpSizePrefix;
+  let hasREX_WPrefix  = typeinfo.HasREX_WPrefix;
+}
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern, InstrItinClass itin,
+              Format f = MRMDestReg>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]>;
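The opcode splice in ITy above keeps bits 7..1 of the supplied opcode and substitutes HasOddOpcode for bit 0, which is how one TableGen definition covers both the even i8 opcode and the odd i16/i32/i64 opcode. The equivalent computation in C++ (illustrative helper):

    #include <cstdint>
    uint8_t ityOpcode(uint8_t Opcode, bool HasOddOpcode) {
      return uint8_t((Opcode & 0xFE) | (HasOddOpcode ? 1 : 0));
    }
    // e.g. ADD: 0x00 -> "add r/m8, r8", 0x01 -> "add r/m32, r32".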
+
+// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has
+// just a regclass (no eflags) as a result.
+class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM>;
+
+// BinOpRR_F - Instructions like "cmp reg, reg", where the pattern has
+// just EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f = MRMDestReg>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+                          EFLAGS))], IIC_BIN_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo,
+        (outs typeinfo.RegClass:$dst),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern>
+  : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Instructions like "add reg, reg, [mem]".
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
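The *_Rev classes above exist because x86 gives most reg,reg ALU operations two equivalent encodings, one with the destination in the ModRM r/m field and one with it in the reg field; codegen only emits the first, but the disassembler must accept both. Concretely (byte sequences for illustration):

    #include <cstdint>
    // Two valid encodings of "add ecx, edx" (mod = 11 register form):
    const uint8_t AddDestForm[] = {0x01, 0xD1}; // opcode 0x01, ModRM 11 010 001
    const uint8_t AddSrcForm[]  = {0x03, 0xCA}; // opcode 0x03, ModRM 11 001 010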
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+                    EFLAGS))]>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Format f, dag outlist, list<dag> pattern>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_R - Instructions like "add reg, reg, imm".
+class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+            [(set EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+                          EFLAGS))]>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+               Format f, dag outlist, list<dag> pattern>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_R - Instructions like "add reg, reg, imm8".
+class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst,
+                   (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+             [(set EFLAGS,
+                   (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+                   (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+                   (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+                           EFLAGS))]>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              list<dag> pattern>
+  : ITy<opcode, MRMDestMem, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+           (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                     SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+            [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+                    addr:$dst),
+             (implicit EFLAGS)]>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+            [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
+              Format f, list<dag> pattern, bits<8> opcode = 0x80>
+  : ITy<opcode, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src), addr:$dst),
+             (implicit EFLAGS)]>;
+
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+                     SDNode opnode, Format f>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+             (implicit EFLAGS)]>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f, bits<8> opcode = 0x80>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+                                  typeinfo.ImmOperator:$src))],
+            opcode>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+               Format f, list<dag> pattern>
+  : ITy<0x82, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
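The Imm8 classes above (and the ri8/mi8 ordering notes further below) exist because x86 can encode an operand-sized immediate as a sign-extended 8-bit field, which is strictly smaller. The predicate behind operators like i32immSExt8 reduces to a range check, and preferring these forms saves code size:

    #include <cstdint>
    bool fitsInSExt8(int64_t Imm) {
      return Imm >= -128 && Imm <= 127;  // value survives sign-extension from 8 bits
    }
    // e.g. "add eax, 16": 5 bytes as 05 id, but only 3 bytes as 83 /0 ib.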
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". +class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI8_F - Instructions like "cmp [mem], imm8". +class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src))]>; + +// BinOpAI - Instructions like "add %eax, %eax, imm". +class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : ITy<opcode, RawFrm, typeinfo, + (outs), (ins typeinfo.ImmOperand:$src), + mnemonic, operands, []>, Sched<[WriteALU]> { + let ImmT = typeinfo.ImmEncoding; + let Uses = [areg]; + let Defs = [areg]; + let hasSideEffects = 0; +} + +/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (...". +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnodeflag, SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isCommutable + + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; + + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } +} + +/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and +/// SBB. +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isCommutable + + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } +} + +/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is +/// defined with "(set EFLAGS, (...". It would be really nice to find a way +/// to factor this with the other ArithBinOp_*. +/// +multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } // isCommutable + + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+    def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+    def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+    def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+    def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+    def NAME#16ri  : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+    def NAME#32ri  : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+    def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+  }
+
+  def NAME#8mr  : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+  def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+  def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+  def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+  // NOTE: These are order specific, we want the mi8 forms to be listed
+  // first so that they are slightly preferred to the mi forms.
+  def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+  def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+  def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+  def NAME#8mi    : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
+  def NAME#16mi   : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
+  def NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
+  def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+
+  def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                           "{$src, %al|AL, $src}">;
+  def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                           "{$src, %ax|AX, $src}">;
+  def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                           "{$src, %eax|EAX, $src}">;
+  def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                           "{$src, %rax|RAX, $src}">;
+  }
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+                         X86and_flag, and, 1, 0>;
+defm OR  : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+                         X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+                         X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+                         X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+                         X86sub_flag, sub, 0, 0>;
+}
+
+// Arithmetic.
+let Uses = [EFLAGS] in {
+  defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+                            1, 0>;
+  defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+                            0, 0>;
+}
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar to AND, except they don't
+// generate a result. From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
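+//
+// For example (an illustrative note, not part of the encoding tables):
+// "test %eax, %eax" computes EAX & EAX only to set ZF/SF/PF, so a following
+// "je" branches when EAX is zero. No destination register is written, which
+// is why every pattern below is "(set EFLAGS, ...)" with an empty out list,
+// and why the and_su fragment (presumably the single-use form of and) is
+// used: the AND result must be otherwise dead for TEST to be a safe match.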
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), + (X86cmp (and_su node:$lhs, node:$rhs), 0)>; + +let isCompare = 1, Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; + } // isCommutable + + def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; + def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; + def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; + def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; + def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; + def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; + + def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, + "{$src, %al|AL, $src}">; + def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, + "{$src, %ax|AX, $src}">; + def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, + "{$src, %rax|RAX, $src}">; + + // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the + // register class is constrained to GR8_NOREX. + let isPseudo = 1 in + def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), + "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; +} + +//===----------------------------------------------------------------------===// +// ANDN Instruction +// +multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, + PatFrag ld_frag> { + def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))], + IIC_BIN_NONMEM>, Sched<[WriteALU]>; + def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, + (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>, + Sched<[WriteALULd, ReadAfterLd]>; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W; +} + +let Predicates = [HasBMI] in { + def : Pat<(and (not GR32:$src1), GR32:$src2), + (ANDN32rr GR32:$src1, GR32:$src2)>; + def : Pat<(and (not GR64:$src1), GR64:$src2), + (ANDN64rr GR64:$src1, GR64:$src2)>; + def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)), + (ANDN32rm GR32:$src1, addr:$src2)>; + def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)), + (ANDN64rm GR64:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// MULX Instruction +// +multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in { + let isCommutable = 1 in + def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst2, 
$dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>; + + let mayLoad = 1 in + def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>; +} +} + +let Predicates = [HasBMI2] in { + let Uses = [EDX] in + defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>; + let Uses = [RDX] in + defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// ADCX Instruction +// +let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { + let SchedRW = [WriteALU] in { + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "adcx{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8, OpSize; + + def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "adcx{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteALULd] in { + def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "adcx{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8, OpSize; + + def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "adcx{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; + } +} + +//===----------------------------------------------------------------------===// +// ADOX Instruction +// +let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { + let SchedRW = [WriteALU] in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "adox{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8XS; + + def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "adox{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>; + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteALULd] in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "adox{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8XS; + + def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "adox{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>; + } +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h new file mode 100644 index 0000000..aaef4a4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -0,0 +1,183 @@ +//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to handle X86'isms in a clean way. +// +// The BuildMem function may be used with the BuildMI function to add entire +// memory references in a single, typed, function call. X86 memory references +// can be very complex expressions (described in the README), so wrapping them +// up behind an easier to use interface makes sense. Descriptions of the +// functions are included below. +// +// For reference, the order of operands for memory references is: +// (Operand), Base, Scale, Index, Displacement. 
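+//
+// As an illustrative sketch of that ordering (values invented for the
+// example): the address [EBX + 4*ESI + 12] would be emitted as the operand
+// sequence Base=EBX, Scale=4, Index=ESI, Displacement=12, followed by a
+// trailing segment-register operand (see addDirectMem below, which notes
+// that memory references are always represented with five values).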
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned Scale;
+  unsigned IndexReg;
+  int Disp;
+  const GlobalValue *GV;
+  unsigned GVOpFlags;
+
+  X86AddressMode()
+    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+    Base.Reg = 0;
+  }
+
+
+  void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+    assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+    if (BaseType == X86AddressMode::RegBase)
+      MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
+                                             false, false, false, 0, false));
+    else {
+      assert(BaseType == X86AddressMode::FrameIndexBase);
+      MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+    }
+
+    MO.push_back(MachineOperand::CreateImm(Scale));
+    MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
+                                           false, false, false, 0, false));
+
+    if (GV)
+      MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+    else
+      MO.push_back(MachineOperand::CreateImm(Disp));
+
+    MO.push_back(MachineOperand::CreateReg(0, false, false,
+                                           false, false, false, 0, false));
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+  // Because memory references are always represented with five
+  // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+  return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+             unsigned Reg, bool isKill, int Offset) {
+  return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                                   unsigned Reg1, bool isKill1,
+                                                   unsigned Reg2, bool isKill2) {
+  return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+            .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+               const X86AddressMode &AM) {
+  assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else {
+    assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  }
+
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+  else
+    MIB.addImm(AM.Disp);
+
+  return MIB.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well.
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Flags = 0;
+  if (MCID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (MCID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset),
+                            Flags, MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+  return addOffset(MIB.addFrameIndex(FI), Offset)
+            .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         unsigned GlobalBaseReg, unsigned char OpFlags) {
+  // FIXME: factor this
+  return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+    .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 0000000..a967a4d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,111 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// CMOV instructions.
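+//
+// For instance, "cmovne %ecx, %eax" copies ECX into EAX only when ZF = 0 and
+// leaves EAX untouched otherwise; that read-modify behavior is why $src1 is
+// tied to $dst ("$src1 = $dst") in the multiclass below. (Illustrative
+// example, not an exhaustive description of the condition codes.)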
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1, SchedRW = [WriteALU] in { + def NAME#16rr + : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))], + IIC_CMOV16_RR>,TB,OpSize; + def NAME#32rr + : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB; + def NAME#64rr + :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB; + } + + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [WriteALULd, ReadAfterLd] in { + def NAME#16rm + : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV16_RM>, + TB, OpSize; + def NAME#32rm + : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; + def NAME#64rm + :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; + } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // end multiclass + + +// Conditional Moves. +defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>; +defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>; +defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>; +defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>; +defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>; +defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>; +defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>; +defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>; +defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>; +defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>; +defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>; +defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>; +defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>; +defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>; +defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>; +defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>; + + +// SetCC instructions. 
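+//
+// For instance, "sete %al" writes 1 to AL when ZF is set and 0 otherwise;
+// the r and m defs below provide the register and memory destination forms
+// of each condition. (Illustrative example.)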
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+  let Uses = [EFLAGS] in {
+    def r  : I<opc, MRM0r, (outs GR8:$dst), (ins),
+               !strconcat(Mnemonic, "\t$dst"),
+               [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+               IIC_SET_R>, TB, Sched<[WriteALU]>;
+    def m  : I<opc, MRM0m, (outs), (ins i8mem:$dst),
+               !strconcat(Mnemonic, "\t$dst"),
+               [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+               IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+  } // Uses = [EFLAGS]
+}
+
+defm SETO  : SETCC<0x90, "seto",  X86_COND_O>;   // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>;  // is overflow bit not set
+defm SETB  : SETCC<0x92, "setb",  X86_COND_B>;   // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>;  // unsigned greater or equal
+defm SETE  : SETCC<0x94, "sete",  X86_COND_E>;   // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>;  // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>;  // unsigned less than or equal
+defm SETA  : SETCC<0x97, "seta",  X86_COND_A>;   // unsigned greater than
+defm SETS  : SETCC<0x98, "sets",  X86_COND_S>;   // is sign bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>;  // is sign bit not set
+defm SETP  : SETCC<0x9A, "setp",  X86_COND_P>;   // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>;  // is parity bit not set
+defm SETL  : SETCC<0x9C, "setl",  X86_COND_L>;   // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>;  // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>;  // signed less than or equal
+defm SETG  : SETCC<0x9F, "setg",  X86_COND_G>;   // signed greater than
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 0000000..d9ff0c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1839 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 32 bits.
+  return getI32Imm((unsigned)N->getZExtValue());
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 8 bits.
+  return getI8Imm((uint8_t)N->getZExtValue());
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction. This expands to code that looks like this:
+//     call  $next_inst
+//     popl  %destreg
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+                      "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
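+//
+// As an illustrative sketch: ADJCALLSTACKDOWN32 with $amt = 16 is typically
+// lowered to "subl $16, %esp" (or folded away entirely) during frame
+// lowering; the exact expansion depends on the target's frame setup, hence
+// the pessimistic ESP/EFLAGS defs below.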
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                         "#ADJCALLSTACKUP",
+                         [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                        Requires<[In32BitMode]>;
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                         "#ADJCALLSTACKUP",
+                         [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                        Requires<[In64BitMode]>;
+}
+
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1 in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+                              (outs),
+                              (ins GR8:$al,
+                                   i64imm:$regsavefi, i64imm:$offset,
+                                   variable_ops),
+                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+                              [(X86vastart_save_xmm_regs GR8:$al,
+                                                         imm:$regsavefi,
+                                                         imm:$offset)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+                 (outs GR64:$dst),
+                 (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+                 "#VAARG_64 $dst, $ap, $size, $mode, $align",
+                 [(set GR64:$dst,
+                    (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+                  (implicit EFLAGS)]>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in the correct sequence.
+// The main point of having a separate instruction is that it has extra
+// unmodelled effects (compared to an ordinary call), such as changing the
+// stack pointer.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+  def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
+                     "# dynamic stack allocation",
+                     [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR32:$dst,
+                         (X86SegAlloca GR32:$size))]>,
+                    Requires<[In32BitMode]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR64:$dst,
+                         (X86SegAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
+}
+
+// The MSVC runtime contains an _ftol2 routine for converting floating-point
+// to integer values. It has a strange calling convention: the input is
+// popped from the x87 stack, and the return value is given in EDX:EAX. No
+// other registers (aside from flags) are touched.
+// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 +// variant is unnecessary. + +let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in { + def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), + "# win32 fptoui", + [(X86WinFTOL RFP32:$src)]>, + Requires<[In32BitMode]>; + + def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), + "# win32 fptoui", + [(X86WinFTOL RFP64:$src)]>, + Requires<[In32BitMode]>; +} + +//===----------------------------------------------------------------------===// +// EH Pseudo Instructions +// +let SchedRW = [WriteSystem] in { +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in { + def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), + "#EH_SJLJ_SETJMP32", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In32BitMode]>; + def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), + "#EH_SJLJ_SETJMP64", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in { + def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), + "#EH_SJLJ_LONGJMP32", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In32BitMode]>; + def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), + "#EH_SJLJ_LONGJMP64", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; + } +} +} // SchedRW + +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { + def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by segmented stacks. +// + +// This is lowered into a RET instruction by MCInstLower. We need +// this so that we don't have to have a MachineBasicBlock which ends +// with a RET and also has successors. +let isPseudo = 1 in { +def MORESTACK_RET: I<0, Pseudo, (outs), (ins), + "", []>; + +// This instruction is lowered to a RET followed by a MOV. The two +// instructions are not generated on a higher level since then the +// verifier sees a MachineBasicBlock ending with a non-terminator. +def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), + "", []>; +} + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instructions that map movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +// FIXME: Set encoding to pseudo. +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, + isCodeGenOnly = 1 in { +def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", + [(set GR8:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; + +// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller +// encoding and avoids a partial-register update sometimes, but doing so +// at isel time interferes with rematerialization in the current register +// allocator. 
For now, this is rewritten when the instruction is lowered
+// to an MCInst.
+def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+                "",
+                [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize,
+                Sched<[WriteZero]>;
+
+// FIXME: Set encoding to pseudo.
+def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
+                [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+}
+
+// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
+// smaller encoding, but doing so at isel time interferes with rematerialization
+// in the current register allocator. For now, this is rewritten when the
+// instruction is lowered to an MCInst.
+// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let Defs = [EFLAGS], isCodeGenOnly=1,
+    AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
+                [(set GR64:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Materialize an i64 constant whose top 32 bits are zero. This could
+// theoretically use MOV32ri with a SUBREG_TO_REG to represent the
+// zero-extension; however, that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isCodeGenOnly = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+                        "", [(set GR64:$dst, i64immZExt32:$src)],
+                        IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type. When
+// this happens, it is great. However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
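+//
+// Illustrative example: for "unsigned borrow = a < b", the compare plus an
+// 8-bit use of the carry may reach isel as (and (sbb reg,reg), 1); the
+// pattern below matches that residue back to a single setb. (The exact DAG
+// depends on earlier combines.)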
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), + (SETBr)>; + +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + +//===----------------------------------------------------------------------===// +// String Pseudo Instructions +// +let SchedRW = [WriteMicrocoded] in { +let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { +def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In32BitMode]>; +def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize, + Requires<[In32BitMode]>; +def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, + Requires<[In32BitMode]>; +} + +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in { +def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize, + Requires<[In64BitMode]>; +def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +} + +// FIXME: Should use "(X86rep_stos AL)" as the pattern. 
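+//
+// As a reminder of the semantics (illustrative): "rep stosb" stores AL into
+// [EDI] ECX times, advancing EDI and decrementing ECX on each iteration;
+// that is why each width below lists the accumulator, count and destination
+// registers in Uses.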
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in { + let Uses = [AL,ECX,EDI] in + def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In32BitMode]>; + let Uses = [AX,ECX,EDI] in + def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize, + Requires<[In32BitMode]>; + let Uses = [EAX,ECX,EDI] in + def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, + Requires<[In32BitMode]>; +} + +let Defs = [RCX,RDI], isCodeGenOnly = 1 in { + let Uses = [AL,RCX,RDI] in + def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + let Uses = [AX,RCX,RDI] in + def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize, + Requires<[In64BitMode]>; + let Uses = [RAX,RCX,RDI] in + def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + + let Uses = [RAX,RCX,RDI] in + def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// + +// ELF TLS Support +// All calls clobber the non-callee saved registers. ESP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addr32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[In32BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[In32BitMode]>; +} + +// All calls clobber the non-callee saved registers. RSP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_addr64", + [(X86tlsaddr tls64addr:$sym)]>, + Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} + +// Darwin TLS Support +// For i386, the address of the thunk is passed on the stack, on return the +// address of the variable is in %eax. %ecx is trashed during the function +// call. All other registers are preserved. 
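+//
+// Illustratively: on Darwin/i386 a read of a __thread variable becomes a
+// call through the variable's thunk, whose address is passed on the stack;
+// the pseudo below exists to model that call's clobbers (EAX result, ECX
+// trashed) for the register allocator.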
+let Defs = [EAX, ECX, EFLAGS],
+    Uses = [ESP],
+    usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                   "# TLSCall_32",
+                   [(X86TLSCall addr:$sym)]>,
+                 Requires<[In32BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi; on return,
+// the address of the variable is in %rax. All other registers are preserved.
+let Defs = [RAX, EFLAGS],
+    Uses = [RSP, RDI],
+    usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLSCall_64",
+                   [(X86TLSCall addr:$sym)]>,
+                 Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// X86 doesn't have 8-bit conditional moves. Use a customInserter to
+// emit control flow. An alternative to this is to mark i8 SELECT as Promote;
+// however, that requires promoting the operands, and can induce additional
+// i8 register pressure.
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+def CMOV_GR8 : I<0, Pseudo,
+                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+                 "#CMOV_GR8 PSEUDO!",
+                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+                                          imm:$cond, EFLAGS))]>;
+
+let Predicates = [NoCMov] in {
+def CMOV_GR32 : I<0, Pseudo,
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+                  "#CMOV_GR32* PSEUDO!",
+                  [(set GR32:$dst,
+                    (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16 : I<0, Pseudo,
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+                  "#CMOV_GR16* PSEUDO!",
+                  [(set GR16:$dst,
+                    (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+} // Predicates = [NoCMov]
+
+// fcmov doesn't handle all possible EFLAGS; provide a fallback if there is no
+// SSE1.
+let Predicates = [FPStackf32] in
+def CMOV_RFP32 : I<0, Pseudo,
+                   (outs RFP32:$dst),
+                   (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
+                   "#CMOV_RFP32 PSEUDO!",
+                   [(set RFP32:$dst,
+                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
+                               EFLAGS))]>;
+// fcmov doesn't handle all possible EFLAGS; provide a fallback if there is no
+// SSE2.
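+//
+// For context (illustrative): fcmov can only test CF/ZF/PF-based conditions
+// (fcmovb, fcmove, fcmovbe, fcmovu and their negations), so conditions such
+// as signed less-than have no direct fcmov form and fall back to the
+// branch-based pseudos here.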
+let Predicates = [FPStackf64] in +def CMOV_RFP64 : I<0, Pseudo, + (outs RFP64:$dst), + (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), + "#CMOV_RFP64 PSEUDO!", + [(set RFP64:$dst, + (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, + EFLAGS))]>; +def CMOV_RFP80 : I<0, Pseudo, + (outs RFP80:$dst), + (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), + "#CMOV_RFP80 PSEUDO!", + [(set RFP80:$dst, + (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, + EFLAGS))]>; +} // UsesCustomInserter = 1, Uses = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Atomic Instruction Pseudo Instructions +//===----------------------------------------------------------------------===// + +// Pseudo atomic instructions + +multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> { + let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in { + let Defs = [EFLAGS, AL] in + def NAME#8 : I<0, Pseudo, (outs GR8:$dst), + (ins i8mem:$ptr, GR8:$val), + !strconcat(mnemonic, "8 PSEUDO!"), []>; + let Defs = [EFLAGS, AX] in + def NAME#16 : I<0, Pseudo,(outs GR16:$dst), + (ins i16mem:$ptr, GR16:$val), + !strconcat(mnemonic, "16 PSEUDO!"), []>; + let Defs = [EFLAGS, EAX] in + def NAME#32 : I<0, Pseudo, (outs GR32:$dst), + (ins i32mem:$ptr, GR32:$val), + !strconcat(mnemonic, "32 PSEUDO!"), []>; + let Defs = [EFLAGS, RAX] in + def NAME#64 : I<0, Pseudo, (outs GR64:$dst), + (ins i64mem:$ptr, GR64:$val), + !strconcat(mnemonic, "64 PSEUDO!"), []>; + } +} + +multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> { + def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val), + (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>; + def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val), + (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>; + def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val), + (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>; + def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val), + (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>; +} + +// Atomic exchange, and, or, xor +defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">; +defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">; +defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">; +defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">; +defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">; +defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">; +defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">; +defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">; + +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">; + +multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> { + let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX], + mayLoad = 1, mayStore = 1, hasSideEffects = 0 in + def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + !strconcat(mnemonic, "6432 PSEUDO!"), []>; +} + +defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">; +defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">; +defm ATOMXOR : 
PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">; +defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">; +defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">; +defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">; +defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">; +defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">; +defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">; +defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">; +defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">; + +//===----------------------------------------------------------------------===// +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Use normal instructions and add lock prefix dynamically. + +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. +let isCodeGenOnly = 1, Defs = [EFLAGS] in +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "or{l}\t{$zero, $dst|$dst, $zero}", + [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK, + Sched<[WriteALULd, WriteRMW]>; + +let hasSideEffects = 1 in +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>, Sched<[WriteLoad]>; + +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize, LOCK; +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; + +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize, LOCK; + +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#64mi32 : 
RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize, LOCK; +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +} + +} + +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; + +// Optimized codegen when the non-memory output is not used. +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), + !strconcat(mnemonic, "{b}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), + !strconcat(mnemonic, "{w}\t$dst"), + [], IIC_UNARY_MEM>, OpSize, LOCK; +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), + !strconcat(mnemonic, "{l}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), + !strconcat(mnemonic, "{q}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +} +} + +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">; + +// Atomic compare and swap. 
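+//
+// Semantics sketch (illustrative): "lock cmpxchg %ecx, (%rdi)" compares EAX
+// with the memory operand; if they match, ECX is stored and ZF is set,
+// otherwise the memory value is loaded into EAX and ZF is cleared. That
+// implicit accumulator use/def is what the Uses/Defs lists below model.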
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, + SDPatternOperator frag, X86MemOperand x86memop, + InstrItinClass itin> { +let isCodeGenOnly = 1 in { + def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), + !strconcat(mnemonic, "\t$ptr"), + [(frag addr:$ptr)], itin>, TB, LOCK; +} +} + +multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic, SDPatternOperator frag, + InstrItinClass itin8, InstrItinClass itin> { +let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { + let Defs = [AL, EFLAGS], Uses = [AL] in + def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), + !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK; + let Defs = [AX, EFLAGS], Uses = [AX] in + def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap), + !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK; + let Defs = [EAX, EFLAGS], Uses = [EAX] in + def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap), + !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK; + let Defs = [RAX, EFLAGS], Uses = [RAX] in + def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap), + !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK; +} +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], + SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", + X86cas8, i64mem, + IIC_CMPX_LOCK_8B>; +} + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", + X86cas16, i128mem, + IIC_CMPX_LOCK_16B>, REX_W; +} + +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", + X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; + +// Atomic exchange and add +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, + string frag, + InstrItinClass itin8, InstrItinClass itin> { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin8>; + def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize; + def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>; + def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", + IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, + TB, LOCK; + +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV 
PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; + +def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_8 addr:$dst, GR8 :$src)]>; +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_16 addr:$dst, GR16:$src)]>; +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_32 addr:$dst, GR32:$src)]>; +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_64 addr:$dst, GR64:$src)]>; + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions. +//===----------------------------------------------------------------------===// + + +// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +let Uses = [EFLAGS], usesCustomInserter = 1 in { + def CMOV_FR32 : I<0, Pseudo, + (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), + "#CMOV_FR32 PSEUDO!", + [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, + EFLAGS))]>; + def CMOV_FR64 : I<0, Pseudo, + (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), + "#CMOV_FR64 PSEUDO!", + [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, + EFLAGS))]>; + def CMOV_V4F32 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V4F32 PSEUDO!", + [(set VR128:$dst, + (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V2F64 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2F64 PSEUDO!", + [(set VR128:$dst, + (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V2I64 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2I64 PSEUDO!", + [(set VR128:$dst, + (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V8F32 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V8F32 PSEUDO!", + [(set VR256:$dst, + (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V4F64 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V4F64 PSEUDO!", + [(set VR256:$dst, + (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V4I64 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V4I64 PSEUDO!", + [(set VR256:$dst, + (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; +} + + +//===----------------------------------------------------------------------===// +// DAG Pattern Matching Rules +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; +def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri 
texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+          (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+          (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+          (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+          (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable, when not in the
+// small code model, should use 'movabs'. FIXME: This is really a hack, the
+// 'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In static codegen with the small code model, we can get the address of a
+// label into a register with 'movl'. FIXME: This is a hack, the 'imm'
+// predicate of the MOV64ri64i32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri64i32 tconstpool  :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri64i32 tjumptable  :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
+
+// In the kernel code model, we can get the address of a label into a register
+// with 'movq'. FIXME: This is a hack, the 'imm' predicate of the MOV64ri32
+// should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have the small code model and -static mode, it is safe to store
+// global addresses directly as immediates. FIXME: This is really a hack, the
+// 'imm' predicate for MOV64mi32 should handle this sort of thing.
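+// For illustration (hypothetical symbol 'foo'), the code-model cases above
+// correspond roughly to:
+//   movabsq $foo, %rax    # arbitrary 64-bit address      (MOV64ri)
+//   movl    $foo, %eax    # small/static: fits in 32 bits (MOV64ri64i32)
+//   movq    $foo, %rax    # kernel: sign-extended imm32   (MOV64ri32)
+//   movq    $foo, (%rdi)  # small/static store case       (MOV64mi32)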
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsStatic]>; + + + +// Calls + +// tls has some funny stuff here... +// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; + + +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (CALL64pcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (CALL64pcrel32 texternalsym:$dst)>; + +// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they +// can never use callee-saved registers. That is the purpose of the GR64_TC +// register classes. +// +// The only volatile register that is never used by the calling convention is +// %r11. This happens when calling a vararg function with 6 arguments. +// +// Match an X86tcret that uses less than 7 volatile registers. +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 0; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) + return false; + return true; +}]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In32BitMode]>; + +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi addr:$dst, imm:$off)>, + Requires<[In32BitMode, IsNotPIC]>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[In32BitMode]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[In32BitMode]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +// Don't fold loads into X86tcret requiring more than 6 regs. +// There wouldn't be enough scratch registers for base+index. +def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), + (TCRETURNmi64 addr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), + (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), + (TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +// Normal calls, with various flavors of addresses. 
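+// E.g. (X86call (i32 tglobaladdr:$dst)) below becomes a pc-relative call
+// such as "calll foo" (opcode E8 with a rel32 displacement); the sketch
+// assumes a hypothetical callee 'foo'.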
+def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; +def : Pat<(X86call (i32 imm:$dst)), + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86cmp GR64:$src1, 0), + (TEST64rr GR64:$src1, GR64:$src1)>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. +multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, + Instruction Inst64> { + let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; + } +} + +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; + +// zextload bool -> zextload byte +def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; + +// extload bool -> extload byte +// When extloading from 16-bit and smaller memory locations into 64-bit +// registers, use zero-extending loads so that the entire 64-bit register is +// defined, avoiding partial-register updates. + +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; +def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>; +def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>; +// For other extloads, use subregs, since the high contents of the register are +// defined after an extload. +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), + sub_32bit)>; + +// anyext. 
Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+                                      (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+
+// Except for i16 -> i32 since isel expects i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result zeroes the high half of the
+// 64-bit register, with a few exceptions: Truncate can be lowered to
+// EXTRACT_SUBREG. CopyFromReg may be copying from a truncate. And x86's cmov
+// doesn't do anything if the condition is false. But any other 32-bit
+// operation will zero-extend up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies. However, we also
+// want to eventually emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read. To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node as an 'add' node if the or'ed bits are known to be zero
+// in the other operand.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  APInt KnownZero0, KnownOne0;
+  CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+  APInt KnownZero1, KnownOne1;
+  CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+  return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if the two operands are known not to share bits.
+// Try this before selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                   "", // orw/addw REG, REG
+                   [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                   "", // orl/addl REG, REG
+                   [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "", // orq/addq REG, REG
+                   [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order-specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
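+// E.g. for a hypothetical "addl $8, %eax": the imm8 form encodes as
+// 83 C0 08 (three bytes) while the imm32 form is 05 08 00 00 00 (five
+// bytes), so preferring the ri8 pseudo saves two bytes when it applies.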
+
+def ADD16ri8_DB : I<0, Pseudo,
+                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                    "", // orw/addw REG, imm8
+                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "", // orw/addw REG, imm
+                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                    "", // orl/addl REG, imm8
+                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "", // orl/addl REG, imm
+                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                    "", // orq/addq REG, imm8
+                    [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                     "", // orq/addq REG, imm
+                     [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                 i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+          (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+          (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+          (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+          (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+          (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+          (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If in addition the last 32 bits can be
+// represented with a sign extension of an 8-bit constant, use that.
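+// E.g. for a hypothetical "andq $15, %rax": emitting "andl $15, %eax"
+// instead gives the same result, since a 32-bit operation implicitly
+// clears bits 63:32 of the destination register.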
+ +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri8 + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo8XForm imm:$imm))), + sub_32bit)>; + +def : Pat<(and GR64:$src, i64immZExt32:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo32XForm imm:$imm))), + sub_32bit)>; + + +// r & (2^16-1) ==> movz +def : Pat<(and GR32:$src1, 0xffff), + (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, + GR32_ABCD)), + sub_8bit))>, + Requires<[In32BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, + Requires<[In32BitMode]>; + +// r & (2^32-1) ==> movz +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), + (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +// r & (2^16-1) ==> movz +def : Pat<(and GR64:$src, 0xffff), + (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR64:$src, 0xff), + (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, + Requires<[In64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, + Requires<[In64BitMode]>; + + +// sext_inreg patterns +def : Pat<(sext_inreg GR32:$src, i16), + (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit))>, + Requires<[In32BitMode]>; + +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, + Requires<[In32BitMode]>; + +def : Pat<(sext_inreg GR64:$src, i32), + (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +def : Pat<(sext_inreg GR64:$src, i16), + (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; +def : Pat<(sext_inreg GR64:$src, i8), + (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, + Requires<[In64BitMode]>; +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, + Requires<[In64BitMode]>; + +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; + +// trunc patterns +def : Pat<(i16 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit)>, + Requires<[In32BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit)>, + Requires<[In32BitMode]>; +def : Pat<(i32 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_32bit)>; +def : Pat<(i16 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_16bit)>; +def : 
Pat<(i8 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_8bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_8bit)>, + Requires<[In64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG GR16:$src, sub_8bit)>, + Requires<[In64BitMode]>; + +// h-register tricks +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)>, + Requires<[In32BitMode]>; +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi)>, + Requires<[In32BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32rr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In32BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; + +// h-register tricks. +// For now, be conservative on x86-64 and use an h-register extract only if the +// value is immediately zero-extended or stored, which are somewhat common +// cases. This uses a bunch of code to prevent a register requiring a REX prefix +// from being allocated in the same instruction as the h register, as there's +// currently no way to describe this requirement to the register allocator. + +// h-register extract and zero-extend. +def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; + +// h-register extract and store. 
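+// E.g. storing bits 15:8 of a value can be a single byte store from the
+// h register, roughly "movb %ah, (%edi)"; MOV8mr_NOREX is used because a
+// REX prefix cannot be combined with %ah/%bh/%ch/%dh.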
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi))>; +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; + + +// (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; + +// Helper imms that check if a mask doesn't change significant shift bits. +def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>; +def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>; + +// (shl x (and y, 31)) ==> (shl x, y) +def : Pat<(shl GR8:$src1, (and CL, immShift32)), + (SHL8rCL GR8:$src1)>; +def : Pat<(shl GR16:$src1, (and CL, immShift32)), + (SHL16rCL GR16:$src1)>; +def : Pat<(shl GR32:$src1, (and CL, immShift32)), + (SHL32rCL GR32:$src1)>; +def : Pat<(store (shl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (SHL8mCL addr:$dst)>; +def : Pat<(store (shl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (SHL16mCL addr:$dst)>; +def : Pat<(store (shl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (SHL32mCL addr:$dst)>; + +def : Pat<(srl GR8:$src1, (and CL, immShift32)), + (SHR8rCL GR8:$src1)>; +def : Pat<(srl GR16:$src1, (and CL, immShift32)), + (SHR16rCL GR16:$src1)>; +def : Pat<(srl GR32:$src1, (and CL, immShift32)), + (SHR32rCL GR32:$src1)>; +def : Pat<(store (srl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (SHR8mCL addr:$dst)>; +def : Pat<(store (srl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (SHR16mCL addr:$dst)>; +def : Pat<(store (srl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (SHR32mCL addr:$dst)>; + +def : Pat<(sra GR8:$src1, (and CL, immShift32)), + (SAR8rCL GR8:$src1)>; +def : Pat<(sra GR16:$src1, (and CL, immShift32)), + (SAR16rCL GR16:$src1)>; +def : Pat<(sra GR32:$src1, (and CL, immShift32)), + (SAR32rCL GR32:$src1)>; +def : Pat<(store (sra (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (SAR8mCL addr:$dst)>; +def : Pat<(store (sra (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (SAR16mCL addr:$dst)>; +def : Pat<(store (sra (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (SAR32mCL addr:$dst)>; + +// (shl x (and y, 63)) ==> (shl x, y) +def : Pat<(shl GR64:$src1, (and CL, immShift64)), + (SHL64rCL GR64:$src1)>; +def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (SHL64mCL addr:$dst)>; + +def : Pat<(srl GR64:$src1, (and CL, immShift64)), + (SHR64rCL GR64:$src1)>; +def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (SHR64mCL addr:$dst)>; + +def : Pat<(sra 
GR64:$src1, (and CL, immShift64)), + (SAR64rCL GR64:$src1)>; +def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (SAR64mCL addr:$dst)>; + + +// (anyext (setcc_carry)) -> (setcc_carry) +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; + + + + +//===----------------------------------------------------------------------===// +// EFLAGS-defining Patterns +//===----------------------------------------------------------------------===// + +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; + +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), + (ADD8rm GR8:$src1, addr:$src2)>; +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), + (ADD16rm GR16:$src1, addr:$src2)>; +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; + +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), + (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(add GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; + +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), + (SUB8rm GR8:$src1, addr:$src2)>; +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), + (SUB16rm GR16:$src1, addr:$src2)>; +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; + +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), + (SUB8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, imm:$src2), + (SUB16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, i16immSExt8:$src2), + (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), + (IMUL16rr GR16:$src1, GR16:$src2)>; +def : Pat<(mul GR32:$src1, GR32:$src2), + (IMUL32rr GR32:$src1, GR32:$src2)>; + +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), + (IMUL16rm GR16:$src1, addr:$src2)>; +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), + (IMUL32rm GR32:$src1, addr:$src2)>; + +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), + (IMUL16rri GR16:$src1, imm:$src2)>; +def : Pat<(mul GR32:$src1, imm:$src2), + (IMUL32rri GR32:$src1, imm:$src2)>; +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), + (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), + (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; + +// 
reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), + (IMUL16rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), + (IMUL32rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), + (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), + (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; + +// Patterns for nodes that do not produce flags, for instructions that do. + +// addition +def : Pat<(add GR64:$src1, GR64:$src2), + (ADD64rr GR64:$src1, GR64:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt8:$src2), + (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt32:$src2), + (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), + (ADD64rm GR64:$src1, addr:$src2)>; + +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), + (SUB64rr GR64:$src1, GR64:$src2)>; +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), + (SUB64rm GR64:$src1, addr:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), + (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), + (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), + (IMUL64rr GR64:$src1, GR64:$src2)>; +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), + (IMUL64rm GR64:$src1, addr:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), + (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), + (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), + (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), + (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; + +// Increment reg. +def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>; +def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + +// Decrement reg. +def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>; +def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), + (OR8rm GR8:$src1, addr:$src2)>; +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), + (OR16rm GR16:$src1, addr:$src2)>; +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), + (OR32rm GR32:$src1, addr:$src2)>; +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), + (OR64rm GR64:$src1, addr:$src2)>; + +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), + (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(or GR32:$src1, i32immSExt8:$src2), + (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), + (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt32:$src2), + (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), + (XOR8rm GR8:$src1, addr:$src2)>; +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), + (XOR16rm GR16:$src1, addr:$src2)>; +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), + (XOR32rm GR32:$src1, addr:$src2)>; +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), + (XOR64rm GR64:$src1, addr:$src2)>; + +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, imm:$src2), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(xor GR32:$src1, imm:$src2), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), + (AND8rm GR8:$src1, addr:$src2)>; +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), + (AND16rm GR16:$src1, addr:$src2)>; +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), + (AND32rm GR32:$src1, addr:$src2)>; +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), + (AND64rm GR64:$src1, addr:$src2)>; + +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), + (AND8ri GR8:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, imm:$src2), + (AND16ri GR16:$src1, imm:$src2)>; +def : Pat<(and GR32:$src1, imm:$src2), + (AND32ri GR32:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, i16immSExt8:$src2), + (AND16ri8 
GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 0000000..0e69651
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,287 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+  def RET    : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret",
+                    [(X86retflag 0)], IIC_RET>;
+  def RETW   : I   <0xC3, RawFrm, (outs), (ins),
+                    "ret{w}",
+                    [], IIC_RET>, OpSize;
+  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret\t$amt",
+                    [(X86retflag timm:$amt)], IIC_RET_IMM>;
+  def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+                    "ret{w}\t$amt",
+                    [], IIC_RET_IMM>, OpSize;
+  def LRETL  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{l|f}", [], IIC_RET>;
+  def LRETW  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{w|f}", [], IIC_RET>, OpSize;
+  def LRETQ  : RI  <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{q|f}", [], IIC_RET>;
+  def LRETI  : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{l|f}\t$amt", [], IIC_RET>;
+  def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+  def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+                        "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+                       "jmp\t$dst", [], IIC_JMP_REL>;
+  // FIXME: Intel syntax for JMP64pcrel32 such that it is not ambiguous
+  // with JMP_1.
+  def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+                       "jmpq\t$dst", [], IIC_JMP_REL>;
+}
+
+// Conditional Branches.
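+// Each ICBr entry below produces a short and a near form, e.g. for JE the
+// rel8 encoding is 74 cb and the rel32 encoding is 0F 84 cd.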
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+  multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
+                       IIC_Jcc>;
+    def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB;
+  }
+}
+
+defm JO  : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>;
+defm JB  : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE  : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA  : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS  : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP  : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL  : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcxz/jecxz/jrcxz instructions.
+let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+  // These are the 32-bit versions of this instruction for the asmparser. In
+  // 32-bit mode, the form with the address-size prefix is jcxz and the
+  // unprefixed form is jecxz.
+  let Uses = [CX] in
+    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In32BitMode]>;
+  let Uses = [ECX] in
+    def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                            "jecxz\t$dst", [], IIC_JCXZ>, Requires<[In32BitMode]>;
+
+  // J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
+  // In 64-bit mode, the form with the address-size prefix is jecxz and the
+  // unprefixed form is jrcxz.
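+  // E.g. in 64-bit mode "67 E3 cb" decodes as jecxz (the address-size
+  // prefix overrides RCX to ECX) while a bare "E3 cb" is jrcxz.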
+ let Uses = [ECX] in + def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>; + let Uses = [RCX] in + def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>; +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", + [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>, + Sched<[WriteJump]>; + def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", + [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, + Requires<[In32BitMode]>, Sched<[WriteJumpLd]>; + + def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", + [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>, + Sched<[WriteJump]>; + def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", + [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, + Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; + + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t{$seg, $off|$off, $seg}", [], + IIC_JMP_FAR_PTR>, OpSize, Sched<[WriteJump]>; + def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t{$seg, $off|$off, $seg}", [], + IIC_JMP_FAR_PTR>, Sched<[WriteJump]>; + def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), + "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, + Sched<[WriteJump]>; + + def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), + "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize, + Sched<[WriteJumpLd]>; + def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst), + "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, + Sched<[WriteJumpLd]>; +} + + +// Loop instructions +let SchedRW = [WriteJump] in { +def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>; +def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>; +def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>; +} + +//===----------------------------------------------------------------------===// +// Call Instructions... +// +let isCall = 1 in + // All calls clobber the non-callee saved registers. ESP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. 
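+// E.g. without the ESP use, a store like "movl %eax, (%esp)" that sets up
+// an outgoing argument could look dead immediately before the call and be
+// deleted; the sketch assumes an argument passed on the stack.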
+  let Uses = [ESP] in {
+    def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+                                (outs), (ins i32imm_pcrel:$dst),
+                                "call{l}\t$dst", [], IIC_CALL_RI>,
+                      Requires<[In32BitMode]>, Sched<[WriteJump]>;
+    def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+                        "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+                      Requires<[In32BitMode]>, Sched<[WriteJump]>;
+    def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+                        "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+                        IIC_CALL_MEM>,
+                      Requires<[In32BitMode,FavorMemIndirectCall]>,
+                      Sched<[WriteJumpLd]>;
+
+    def FARCALL16i  : Iseg16<0x9A, RawFrmImm16, (outs),
+                             (ins i16imm:$off, i16imm:$seg),
+                             "lcall{w}\t{$seg, $off|$off, $seg}", [],
+                             IIC_CALL_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+    def FARCALL32i  : Iseg32<0x9A, RawFrmImm16, (outs),
+                             (ins i32imm:$off, i16imm:$seg),
+                             "lcall{l}\t{$seg, $off|$off, $seg}", [],
+                             IIC_CALL_FAR_PTR>, Sched<[WriteJump]>;
+
+    def FARCALL16m  : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+                        "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize,
+                      Sched<[WriteJumpLd]>;
+    def FARCALL32m  : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+                        "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>,
+                      Sched<[WriteJumpLd]>;
+
+    // callw for 16-bit code for the assembler.
+    let isAsmParserOnly = 1 in
+      def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+                                  (outs), (ins i16imm_pcrel:$dst),
+                                  "callw\t$dst", []>, OpSize;
+  }
+
+
+// Tail call stuff.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+  let Uses = [ESP] in {
+  def TCRETURNdi : PseudoI<(outs),
+                           (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+  def TCRETURNri : PseudoI<(outs),
+                           (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi : PseudoI<(outs),
+                           (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+  // FIXME: These should be pseudo instructions that are lowered when going to
+  // MCInst.
+  def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+                           (ins i32imm_pcrel:$dst),
+                           "jmp\t$dst  # TAILCALL",
+                           [], IIC_JMP_REL>;
+  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                   "", [], IIC_JMP_REG>;  // FIXME: Remove encoding when JIT is dead.
+  let mayLoad = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+                   "jmp{l}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+  // NOTE: this pattern doesn't match "X86call imm", because we do not know
+  // that the offset between an arbitrary immediate and the call will fit in
+  // the 32-bit pcrel field that we have.
+  def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+                                (outs), (ins i64i32imm_pcrel:$dst),
+                                "call{q}\t$dst", [], IIC_CALL_RI>,
+                      Requires<[In64BitMode]>;
+  def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+                        "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+                        IIC_CALL_RI>,
+                      Requires<[In64BitMode]>;
+  def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+                        "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+                        IIC_CALL_MEM>,
+                      Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+  def FARCALL64     : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+                         "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isCodeGenOnly = 1 in
+  // __chkstk(MSVC): clobber R10, R11 and EFLAGS.
+ // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. + let Defs = [RAX, R10, R11, RSP, EFLAGS], + Uses = [RSP] in { + def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst), + "call{q}\t$dst", [], IIC_CALL_RI>, + Requires<[IsWin64]>, Sched<[WriteJump]>; + } + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + let mayLoad = 1 in + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset), []>; + + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst), + "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td new file mode 100644 index 0000000..6dc7175 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -0,0 +1,186 @@ +//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the sign and zero extension operations. +// +//===----------------------------------------------------------------------===// + +let neverHasSideEffects = 1 in { + let Defs = [AX], Uses = [AL] in + def CBW : I<0x98, RawFrm, (outs), (ins), + "{cbtw|cbw}", []>, OpSize; // AX = signext(AL) + let Defs = [EAX], Uses = [AX] in + def CWDE : I<0x98, RawFrm, (outs), (ins), + "{cwtl|cwde}", []>; // EAX = signext(AX) + + let Defs = [AX,DX], Uses = [AX] in + def CWD : I<0x99, RawFrm, (outs), (ins), + "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX) + let Defs = [EAX,EDX], Uses = [EAX] in + def CDQ : I<0x99, RawFrm, (outs), (ins), + "{cltd|cdq}", []>; // EDX:EAX = signext(EAX) + + + let Defs = [RAX], Uses = [EAX] in + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", []>; // RAX = signext(EAX) + + let Defs = [RAX,RDX], Uses = [RAX] in + def CQO : RI<0x99, RawFrm, (outs), (ins), + "{cqto|cqo}", []>; // RDX:RAX = signext(RAX) +} + + + +// Sign/Zero extenders +let neverHasSideEffects = 1 in { +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, + TB, OpSize, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, + TB, OpSize, Sched<[WriteALULd]>; +} // neverHasSideEffects = 1 +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; +def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB, + Sched<[WriteALULd]>; +def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs 
GR32:$dst), (ins GR16:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; +def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, + TB, Sched<[WriteALULd]>; + +let neverHasSideEffects = 1 in { +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, + TB, OpSize, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, + TB, OpSize, Sched<[WriteALULd]>; +} // neverHasSideEffects = 1 +def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, + Sched<[WriteALU]>; +def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB, + Sched<[WriteALULd]>; +def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB, + Sched<[WriteALU]>; +def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; + +// These are the same as the regular MOVZX32rr8 and MOVZX32rm8 +// except that they use GR32_NOREX for the output operand register class +// instead of GR32. This allows them to operate on h registers on x86-64. +let neverHasSideEffects = 1, isCodeGenOnly = 1 in { +def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [], IIC_MOVZX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; +} + +// MOVSX64rr8 always has a REX prefix and it has an 8-bit register +// operand, which makes it a rare instruction with an 8-bit register +// operand that can never access an h register. If support for h registers +// were generalized, this would require a special register class. 
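+// (Illustrative note: once a REX prefix is present, the 8-bit register
+// encodings that would otherwise name AH/CH/DH/BH select SPL/BPL/SIL/DIL
+// instead, so e.g. "movsbq %ah, %rax" is not encodable at all.)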
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+                    Sched<[WriteALU]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+                    Sched<[WriteALULd]>;
+
+// movzbq and movzwq encodings for the disassembler
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+                      "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                      TB, Sched<[WriteALU]>;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+                      "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                      TB, Sched<[WriteALULd]>;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALU]>;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALULd]>;
+
+// FIXME: These should be Pat patterns.
+let isCodeGenOnly = 1 in {
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                   "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALU]>;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                   "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
+                   TB, Sched<[WriteALULd]>;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                   "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALU]>;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                   "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
+                   IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension; however, this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
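+// (Illustrative sketch: on x86-64, any write to a 32-bit register implicitly
+// zeroes bits 63:32 of the containing 64-bit register, so a plain
+//   movl %eax, %eax
+// already performs the 32-to-64-bit zext that MOVZX64rr32 below models with
+// a mov encoding.)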
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src), + "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>, + Sched<[WriteALU]>; +def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), + "", [(set GR64:$dst, (zextloadi64i32 addr:$src))], + IIC_MOVZX>, Sched<[WriteALULd]>; +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td new file mode 100644 index 0000000..7759a8a2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -0,0 +1,368 @@ +//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes FMA (Fused Multiply-Add) instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FMA3 - Intel 3 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in { +multiclass fma3p_rm<bits<8> opc, string OpcodeStr, + PatFrag MemFrag128, PatFrag MemFrag256, + ValueType OpVT128, ValueType OpVT256, + SDPatternOperator Op = null_frag> { + let isCommutable = 1 in + def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpVT128 (Op VR128:$src2, + VR128:$src1, VR128:$src3)))]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, + (MemFrag128 addr:$src3))))]>; + + let isCommutable = 1 in + def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, + VR256:$src3)))]>, VEX_L; + + let mayLoad = 1 in + def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (Op VR256:$src2, VR256:$src1, + (MemFrag256 addr:$src3))))]>, VEX_L; +} +} // Constraints = "$src1 = $dst" + +multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy, + PatFrag MemFrag128, PatFrag MemFrag256, + SDNode Op, ValueType OpTy128, ValueType OpTy256> { + defm r213 : fma3p_rm<opc213, + !strconcat(OpcodeStr, "213", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; +let neverHasSideEffects = 1 in { + defm r132 : fma3p_rm<opc132, + !strconcat(OpcodeStr, "132", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; + defm r231 : fma3p_rm<opc231, + !strconcat(OpcodeStr, "231", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; +} // neverHasSideEffects = 1 +} + +// Fused Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32, + memopv8f32, X86Fmadd, v4f32, v8f32>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32, + memopv8f32, X86Fmsub, v4f32, 
v8f32>; + defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", + memopv4f32, memopv8f32, X86Fmaddsub, + v4f32, v8f32>; + defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", + memopv4f32, memopv8f32, X86Fmsubadd, + v4f32, v8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64, + memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64, + memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", + memopv2f64, memopv4f64, X86Fmaddsub, + v2f64, v4f64>, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", + memopv2f64, memopv4f64, X86Fmsubadd, + v2f64, v4f64>, VEX_W; +} + +// Fused Negative Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32, + memopv8f32, X86Fnmadd, v4f32, v8f32>; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32, + memopv8f32, X86Fnmsub, v4f32, v8f32>; +} +let ExeDomain = SSEPackedDouble in { + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64, + memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", + memopv2f64, memopv4f64, X86Fnmsub, v2f64, + v4f64>, VEX_W; +} + +let Constraints = "$src1 = $dst" in { +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, + RegisterClass RC, ValueType OpVT, PatFrag mem_frag, + SDPatternOperator OpNode = null_frag> { + let isCommutable = 1 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src2, RC:$src1, + (mem_frag addr:$src3))))]>; +} + +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic IntId, + RegisterClass RC> { + let isCommutable = 1 in + def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1, + VR128:$src3))]>; + def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>; +} +} // Constraints = "$src1 = $dst" + +multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, Intrinsic Int, + SDNode OpNode, RegisterClass RC, ValueType OpVT, + X86MemOperand x86memop, Operand memop, PatFrag mem_frag, + ComplexPattern mem_cpat> { +let neverHasSideEffects = 1 in { + defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), + x86memop, RC, OpVT, mem_frag>; + defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), + x86memop, RC, OpVT, mem_frag>; +} + +defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), + x86memop, RC, OpVT, mem_frag, OpNode>, + fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), + memop, mem_cpat, Int, RC>; +} + +multiclass 
fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, Intrinsic IntF32, Intrinsic IntF64, + SDNode OpNode> { + defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", IntF32, OpNode, + FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>; + defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", IntF64, OpNode, + FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W; +} + +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, + int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG; +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, + int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG; + +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, + int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG; +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, + int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG; + + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>; +// For disassembler +let isCodeGenOnly = 1, hasSideEffects = 0 in + def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; +} + +multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { + let isCommutable = 1 in + def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, + mem_cpat:$src3))]>, VEX_W, MemOp4; + def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT128, ValueType OpVT256, + PatFrag ld_frag128, PatFrag ld_frag256> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + 
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, + VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, + (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + let isCommutable = 1 in + def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, + VEX_W, MemOp4, VEX_L; + def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, + (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L; + def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L; +// For disassembler +let isCodeGenOnly = 1, hasSideEffects = 0 in { + def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_L; +} // isCodeGenOnly = 1 +} + +defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; +defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; +defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; +defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; +defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; +defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; +defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; +defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + +let ExeDomain = SSEPackedSingle in { + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFMSUBPS4 : 
fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + memopv2f64, memopv4f64>; +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td new file mode 100644 index 0000000..2224a08 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -0,0 +1,692 @@ +//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 x87 FPU instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FPStack specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, + SDTCisVT<1, f80>]>; +def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; +def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; + +def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; +def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, + SDNPMemOperand]>; +def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; +def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, + [SDNPHasChain, SDNPMayStore, SDNPSideEffect, + SDNPMemOperand]>; + +//===----------------------------------------------------------------------===// +// FPStack pattern fragments +//===----------------------------------------------------------------------===// + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def fpimmneg0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-0.0); +}]>; + +def fpimm1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+1.0); +}]>; + +def fpimmneg1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-1.0); +}]>; + +// Some 'special' instructions +let usesCustomInserter = 1 in { // Expanded after instruction selection. + def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), + [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), + [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), + [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; + def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), + [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), + [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), + [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; + def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), + [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), + [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), + [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; +} + +// All FP Stack operations are represented with four instructions here. 
The
+// first three instructions, generated by the instruction selector, use
+// "RFP32", "RFP64" or "RFP80" registers: traditional register files to
+// reference 32-bit, 64-bit or 80-bit floating point values. These sizes apply
+// to the values, not the registers, which are always 80 bits; RFP32, RFP64
+// and RFP80 can be copied to each other without losing information. These
+// instructions are all pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// Pseudo Instruction for FP stack return values.
+def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+                   [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+                   [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+                 [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
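+// (Worked example, for clarity: a single line below such as
+//   defm ADD : FPBinary<fadd, MRM0m, "add">;
+// expands to the ADD_Fp*m pseudo forms used by the instruction selector as
+// well as the real ADD_F32m/ADD_F64m instructions, which print as
+// "fadd{s}\t$src" and "fadd{l}\t$src".)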
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { +// ST(0) = ST(0) + [mem] +def _Fp32m : FpIf32<(outs RFP32:$dst), + (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; +def _Fp64m : FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, + [(set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; +def _Fp64m32: FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; +def _Fp80m32: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; +def _Fp80m64: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, + [(set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")> { + let mayLoad = 1; +} +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")> { + let mayLoad = 1; +} +// ST(0) = ST(0) + [memint] +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), + OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src1, + (X86fild addr:$src2, i16)))]>; +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), + OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src1, + (X86fild addr:$src2, i32)))]>; +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), + OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src1, + (X86fild addr:$src2, i16)))]>; +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), + OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src1, + (X86fild addr:$src2, i32)))]>; +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), + OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src1, + (X86fild addr:$src2, i16)))]>; +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), + OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src1, + (X86fild addr:$src2, i32)))]>; +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")> { + let mayLoad = 1; +} +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")> { + let mayLoad = 1; +} +} + +let Defs = [FPSW] in { +defm ADD : FPBinary_rr<fadd>; +defm SUB : FPBinary_rr<fsub>; +defm MUL : FPBinary_rr<fmul>; +defm DIV : FPBinary_rr<fdiv>; +defm ADD : FPBinary<fadd, MRM0m, "add">; +defm SUB : FPBinary<fsub, MRM4m, "sub">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr">; +defm MUL : FPBinary<fmul, MRM1m, "mul">; +defm DIV : FPBinary<fdiv, MRM6m, "div">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +} + +class FPST0rInst<bits<8> o, string asm> + : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8; +class FPrST0Inst<bits<8> o, string asm> + : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC; +class FPrST0PInst<bits<8> o, string asm> + : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE; + +// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion +// of some of the 'reverse' forms of the fsub and fdiv instructions. As such, +// we have to put some 'r's in and take them out of weird places. 
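+// (Clarifying note: in the asm strings below, "{r}" prints the 'r' only in
+// AT&T syntax and "{|r}" prints it only in Intel syntax, so each dialect
+// gets the spelling its assembler expects for the reversed forms.)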
+def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">; +def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">; +def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">; +def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">; +def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">; +def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">; +def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">; +def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">; +def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">; +def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">; +def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">; +def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">; +def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">; +def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">; +def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">; +def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">; +def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">; +def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">; + +def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">; +def COMP_FST0r : FPST0rInst <0xD8, "fcomp\t$op">; + +// Unary operations. +multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> { +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src))]>; +def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9; +} + +let Defs = [FPSW] in { +defm CHS : FPUnary<fneg, 0xE0, "fchs">; +defm ABS : FPUnary<fabs, 0xE1, "fabs">; +defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">; +defm SIN : FPUnary<fsin, 0xFE, "fsin">; +defm COS : FPUnary<fcos, 0xFF, "fcos">; + +let neverHasSideEffects = 1 in { +def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; +def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; +def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; +} +def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9; +} // Defs = [FPSW] + +// Versions of FP instructions that take a single memory operand. Added for the +// disassembler; remove as they are included with patterns elsewhere. +def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; +def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; + +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; +def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">; + +def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; +def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; + +def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; +def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; + +def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; +def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; + +def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; +def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; + +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">; + +// Floating point cmovs. 
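+// (Note: each fcmovcc below copies ST(i) into ST(0) when its condition,
+// computed from EFLAGS, holds; this is why the pseudo forms that follow
+// carry Uses = [EFLAGS].)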
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+                          CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp64  : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+                          CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+                    CondMovFP,
+                     [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+                                        cc, EFLAGS))]>,
+               Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVE_F  : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVP_F  : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovu\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                   "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in { +def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (loadf32 addr:$src))]>; +let isReMaterializable = 1 in + def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (loadf64 addr:$src))]>; +def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, + [(set RFP80:$dst, (loadf80 addr:$src))]>; +} +def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; +def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; +def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; +def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i64))]>; + +def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, + [(store RFP32:$src, addr:$op)]>; +def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, + [(truncstoref32 RFP64:$src, addr:$op)]>; +def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, + [(store RFP64:$src, addr:$op)]>; +def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, + [(truncstoref32 RFP80:$src, addr:$op)]>; +def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, + [(truncstoref64 RFP80:$src, addr:$op)]>; +// FST does not support 80-bit memory target; FSTP must be used. 
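+// (Consequently, of the 80-bit store forms below, only the popping
+// ST_FpP80m carries a real f80 store pattern; the hardware's only 80-bit
+// store is the popping "fstp{t}".)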
+ +let mayStore = 1, neverHasSideEffects = 1 in { +def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; +def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; +def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; +} +def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, + [(store RFP80:$src, addr:$op)]>; +let mayStore = 1, neverHasSideEffects = 1 in { +def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; +} + +let mayLoad = 1, SchedRW = [WriteLoad] in { +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", + IIC_FLD>; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", + IIC_FLD>; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src", + IIC_FLD80>; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src", + IIC_FILD>; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", + IIC_FILD>; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", + IIC_FILD>; +} +let mayStore = 1, SchedRW = [WriteStore] in { +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", + IIC_FST>; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", + IIC_FST>; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst", + IIC_FST>; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst", + IIC_FST>; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst", + IIC_FST80>; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst", + IIC_FIST>; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst", + IIC_FIST>; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst", + IIC_FIST>; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst", + IIC_FIST>; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", + IIC_FIST>; +} + +// FISTTP requires SSE3 even though it's a FPStack op. 
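+// (Note: fisttp always converts with truncation, regardless of the rounding
+// control in the FP control word, so the ISTT_Fp* pseudos below can
+// implement X86fp_to_i*mem without the control-word rewriting that the
+// FP*_TO_INT*_IN_MEM pseudos above require.)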
+let Predicates = [HasSSE3] in { +def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; +def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i32mem RFP32:$src, addr:$op)]>; +def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i64mem RFP32:$src, addr:$op)]>; +def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i16mem RFP64:$src, addr:$op)]>; +def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i32mem RFP64:$src, addr:$op)]>; +def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i64mem RFP64:$src, addr:$op)]>; +def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i16mem RFP80:$src, addr:$op)]>; +def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i32mem RFP80:$src, addr:$op)]>; +def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; +} // Predicates = [HasSSE3] + +let mayStore = 1, SchedRW = [WriteStore] in { +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", + IIC_FST>; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", + IIC_FST>; +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), + "fisttp{ll}\t$dst", IIC_FST>; +} + +// FP Stack manipulation instructions. +let SchedRW = [WriteMove] in { +def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op", + IIC_FLD>, D9; +def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op", + IIC_FST>, DD; +def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op", + IIC_FST>, DD; +def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op", + IIC_FXCH>, D9; +} + +// Floating point constant loads. +let isReMaterializable = 1 in { +def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm0)]>; +def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm1)]>; +def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm0)]>; +def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm1)]>; +def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm0)]>; +def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm1)]>; +} + +let SchedRW = [WriteZero] in { +def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9; +def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9; +} + +// Floating point compares. 
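+// (Note: the fucom forms below only update the FPU status word (FPSW), so
+// reaching EFLAGS takes a separate fnstsw/sahf sequence, while the P6
+// fucomi/fcomi forms write EFLAGS directly; hence the two groups of defs
+// with different Defs lists.)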
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                      [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr    : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                     (outs), (ins RST:$reg),
+                     "fucom\t$reg", IIC_FUCOM>, DD;
+def UCOM_FPr   : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                     (outs), (ins RST:$reg),
+                     "fucomp\t$reg", IIC_FUCOM>, DD;
+def UCOM_FPPr  : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                     (outs), (ins),
+                     "fucompp", IIC_FUCOM>, DA;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr   : FPI<0xE8, AddRegFrm,    // CC = cmp ST(0) with ST(i)
+                     (outs), (ins RST:$reg),
+                     "fucomi\t$reg", IIC_FUCOMI>, DB;
+def UCOM_FIPr  : FPI<0xE8, AddRegFrm,    // CC = cmp ST(0) with ST(i), pop
+                     (outs), (ins RST:$reg),
+                     "fucompi\t$reg", IIC_FUCOMI>, DF;
+}
+
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                  "fcomi\t$reg", IIC_FCOMI>, DB;
+def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                   "fcompi\t$reg", IIC_FCOMI>, DF;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r  : I<0xE0, RawFrm,             // AX = fp flags
+                   (outs), (ins), "fnstsw %ax",
+                   [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
+
+def FNSTCW16m  : I<0xD9, MRM7m,              // [mem16] = X87 control word
+                   (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+                   [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m   : I<0xD9, MRM5m,              // X87 control word = [mem16]
+                   (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+                   Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
+def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
+                "ffree\t$reg", IIC_FFREE>, DD;
+// Clear exceptions
+
+let Defs = [FPSW] in
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE;
+
+def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+               "fxsave\t$dst", [], IIC_FXSAVE>, TB;
+def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+                 "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W,
+                 Requires<[In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
+def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                  "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
+                  Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALLs that return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+                                               RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+                                               RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+                                               RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+                                               RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+          Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+          Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..a71e024
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,660 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> { + bits<6> Value = val; +} + +def Pseudo : Format<0>; def RawFrm : Format<1>; +def AddRegFrm : Format<2>; def MRMDestReg : Format<3>; +def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>; +def MRMSrcMem : Format<6>; +def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>; +def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>; +def MRM6r : Format<22>; def MRM7r : Format<23>; +def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; +def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; +def MRM6m : Format<30>; def MRM7m : Format<31>; +def MRMInitReg : Format<32>; +def MRM_C1 : Format<33>; +def MRM_C2 : Format<34>; +def MRM_C3 : Format<35>; +def MRM_C4 : Format<36>; +def MRM_C8 : Format<37>; +def MRM_C9 : Format<38>; +def MRM_CA : Format<39>; +def MRM_CB : Format<40>; +def MRM_E8 : Format<41>; +def MRM_F0 : Format<42>; +def RawFrmImm8 : Format<43>; +def RawFrmImm16 : Format<44>; +def MRM_F8 : Format<45>; +def MRM_F9 : Format<46>; +def MRM_D0 : Format<47>; +def MRM_D1 : Format<48>; +def MRM_D4 : Format<49>; +def MRM_D5 : Format<50>; +def MRM_D6 : Format<51>; +def MRM_D8 : Format<52>; +def MRM_D9 : Format<53>; +def MRM_DA : Format<54>; +def MRM_DB : Format<55>; +def MRM_DC : Format<56>; +def MRM_DD : Format<57>; +def MRM_DE : Format<58>; +def MRM_DF : Format<59>; + +// ImmType - This specifies the immediate type used by an instruction. This is +// part of the ad-hoc solution used to emit machine instruction encodings by our +// machine code emitter. +class ImmType<bits<3> val> { + bits<3> Value = val; +} +def NoImm : ImmType<0>; +def Imm8 : ImmType<1>; +def Imm8PCRel : ImmType<2>; +def Imm16 : ImmType<3>; +def Imm16PCRel : ImmType<4>; +def Imm32 : ImmType<5>; +def Imm32PCRel : ImmType<6>; +def Imm64 : ImmType<7>; + +// FPFormat - This specifies what form this FP instruction has. This is used by +// the Floating-Point stackifier pass. +class FPFormat<bits<3> val> { + bits<3> Value = val; +} +def NotFP : FPFormat<0>; +def ZeroArgFP : FPFormat<1>; +def OneArgFP : FPFormat<2>; +def OneArgFPRW : FPFormat<3>; +def TwoArgFP : FPFormat<4>; +def CompareFP : FPFormat<5>; +def CondMovFP : FPFormat<6>; +def SpecialFP : FPFormat<7>; + +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. +class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + +// Prefix byte classes which are used to indicate to the ad-hoc machine code +// emitter that various prefix bytes are required. 
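+// (Worked example: a definition tagged "TB, OpSize", like the MOVSX16rr8
+// def earlier in this import, ends up with Prefix = 1 and
+// hasOpSizePrefix = 1 below, i.e. a 0x0F escape byte plus a 0x66
+// operand-size prefix.)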
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class SegFS { bits<2> SegOvrBits = 1; }
+class SegGS { bits<2> SegOvrBits = 2; }
+class TB { bits<5> Prefix = 1; }
+class REP { bits<5> Prefix = 2; }
+class D8 { bits<5> Prefix = 3; }
+class D9 { bits<5> Prefix = 4; }
+class DA { bits<5> Prefix = 5; }
+class DB { bits<5> Prefix = 6; }
+class DC { bits<5> Prefix = 7; }
+class DD { bits<5> Prefix = 8; }
+class DE { bits<5> Prefix = 9; }
+class DF { bits<5> Prefix = 10; }
+class XD { bits<5> Prefix = 11; }
+class XS { bits<5> Prefix = 12; }
+class T8 { bits<5> Prefix = 13; }
+class TA { bits<5> Prefix = 14; }
+class A6 { bits<5> Prefix = 15; }
+class A7 { bits<5> Prefix = 16; }
+class T8XD { bits<5> Prefix = 17; }
+class T8XS { bits<5> Prefix = 18; }
+class TAXD { bits<5> Prefix = 19; }
+class XOP8 { bits<5> Prefix = 20; }
+class XOP9 { bits<5> Prefix = 21; }
+class VEX { bit hasVEXPrefix = 1; }
+class VEX_W { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
+class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
+class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
+class VEX_L { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
+class MemOp4 { bit hasMemOp4Prefix = 1; }
+class XOP { bit hasXOP_Prefix = 1; }
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+              string AsmStr,
+              InstrItinClass itin,
+              Domain d = GenericDomain>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<6> FormBits = Form.Value;
+  ImmType ImmT = i;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  string AsmString = AsmStr;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  let Itinerary = itin;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit hasOpSizePrefix = 0;  // Does this inst have a 0x66 prefix?
+  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+
+  bits<5> Prefix = 0;       // Which prefix byte does this inst have?
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+  FPFormat FPForm = NotFP;  // What flavor of FP instruction is this?
+  bit hasLockPrefix = 0;    // Does this inst have a 0xF0 prefix?
+  bits<2> SegOvrBits = 0;   // Segment override prefix.
+  Domain ExeDomain = d;
+  bit hasVEXPrefix = 0;     // Does this inst require a VEX prefix?
+  bit hasVEX_WPrefix = 0;   // Does this inst set the VEX_W field?
+  bit hasVEX_4VPrefix = 0;  // Does this inst require the VEX.VVVV field?
+  bit hasVEX_4VOp3Prefix = 0; // Does this inst require the VEX.VVVV field to
+                              // encode the third operand?
+  bit hasVEX_i8ImmReg = 0;  // Does this inst require the last source register
+                            // to be encoded in an immediate field?
+  bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
+  bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit?
+  bit has3DNow0F0FOpcode = 0; // Wacky 3DNow! encoding?
+  bit hasMemOp4Prefix = 0;  // Same bit as VEX_W, but used for swapping operands
+  bit hasXOP_Prefix = 0;    // Does this inst require an XOP prefix?
+
+  // TSFlags layout should be kept in sync with X86InstrInfo.h.
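+  // (Sketch, assuming exactly the bit layout below: the C++ side recovers
+  // each field with a shift/mask pair, e.g.
+  //   unsigned Form   = TSFlags & 0x3F;          // TSFlags{5-0}
+  //   unsigned Opcode = (TSFlags >> 25) & 0xFF;  // TSFlags{32-25}
+  // which is what keeping this in sync with X86InstrInfo.h amounts to.)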
+ let TSFlags{5-0} = FormBits; + let TSFlags{6} = hasOpSizePrefix; + let TSFlags{7} = hasAdSizePrefix; + let TSFlags{12-8} = Prefix; + let TSFlags{13} = hasREX_WPrefix; + let TSFlags{16-14} = ImmT.Value; + let TSFlags{19-17} = FPForm.Value; + let TSFlags{20} = hasLockPrefix; + let TSFlags{22-21} = SegOvrBits; + let TSFlags{24-23} = ExeDomain.Value; + let TSFlags{32-25} = Opcode; + let TSFlags{33} = hasVEXPrefix; + let TSFlags{34} = hasVEX_WPrefix; + let TSFlags{35} = hasVEX_4VPrefix; + let TSFlags{36} = hasVEX_4VOp3Prefix; + let TSFlags{37} = hasVEX_i8ImmReg; + let TSFlags{38} = hasVEX_L; + let TSFlags{39} = ignoresVEX_L; + let TSFlags{40} = has3DNow0F0FOpcode; + let TSFlags{41} = hasMemOp4Prefix; + let TSFlags{42} = hasXOP_Prefix; +} + +class PseudoI<dag oops, dag iops, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> { + let Pattern = pattern; +} + +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +// FPStack Instruction Templates: +// FPI - Floating Point Instruction template. +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm, + InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, [], itin> {} + +// FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin = NoItinerary> + : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { + let FPForm = fp; + let Pattern = pattern; +} + +// Templates for instructions that use a 16- or 32-bit segmented address as +// their only operand: lcall (FAR CALL) and ljmp (FAR JMP) +// +// Iseg16 - 16-bit segment selector, 16-bit offset +// Iseg32 - 16-bit segment selector, 32-bit offset + +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +def __xs : XS; + +// SI - SSE 1 & 2 scalar instructions +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// SIi8 - SSE 1 & 2 scalar instructions +class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// PI - SSE 1 & 2 packed instructions +class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// MMXPI - SSE 1 & 2 packed instructions with MMX operands +class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]); +} + +// PIi8 - SSE 1 & 2 packed instructions with immediate +class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); +} + +// SSE1 Instruction Templates: +// +// SSI - SSE1 instructions with XS prefix. +// PSI - SSE1 instructions with TB prefix. +// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. +// VSSI - SSE1 instructions with XS prefix in AVX form. +// VPSI - SSE1 instructions with TB prefix in AVX form. 
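+// For illustration (a rough sketch, not a def added by this patch): an SSE1
+// packed instruction would instantiate one of the templates below, e.g.
+//   def ADDPSrr : PSI<0x58, MRMSrcReg, (outs VR128:$dst),
+//                     (ins VR128:$src1, VR128:$src2),
+//                     "addps\t{$src2, $dst|$dst, $src2}",
+//                     [(set VR128:$dst, (v4f32 (fadd VR128:$src1,
+//                                                    VR128:$src2)))]>;
+// so the TB prefix, SSEPackedSingle domain and UseSSE1 predicate all come
+// from the template rather than from each individual definition.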
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+        Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+        Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin,
+          SSEPackedSingle>, TB, Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+//   SDI       - SSE2 instructions with XD prefix.
+//   SDIi8     - SSE2 instructions with ImmT == Imm8 and XD prefix.
+//   S2SI      - SSE2 instructions with XS prefix.
+//   S2SIi8    - SSE2 instructions with ImmT == Imm8 and XS prefix.
+//   PDI       - SSE2 instructions with TB and OpSize prefixes.
+//   PDIi8     - SSE2 instructions with ImmT == Imm8 and TB and OpSize
+//               prefixes.
+//   VSDI      - SSE2 instructions with XD prefix in AVX form.
+//   VS2SI     - SSE2 instructions with XS prefix in AVX form.
+//   VPDI      - SSE2 instructions with TB and OpSize prefixes in AVX form.
+//   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+//               MMX operands.
+//   MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+//               MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+        Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+        Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+        Requires<[HasAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin,
+          SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+//   S3I  - SSE3 instructions with TB and OpSize prefixes.
+//   S3SI - SSE3 instructions with XS prefix.
+//   S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+        Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+        Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+        Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+//   SS38I    - SSSE3 instructions with T8 prefix.
+//   SS3AI    - SSSE3 instructions with TA prefix.
+//   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+//   MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit
+// versions use the MMX registers and are grouped with the MMX classes; they
+// need to be enabled even if AVX is enabled.
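+// For illustration: the Use*/Has* split in the Requires<> lists is
+// deliberate. A UseSSSE3-style predicate is false once AVX is available (the
+// VEX-encoded form is selected instead), while a HasSSSE3-style predicate
+// stays true under AVX, which is what keeps the MMX-register forms
+// selectable:
+//   128-bit XMM form: Requires<[UseSSSE3]>
+//   64-bit MMX form:  Requires<[HasSSSE3]>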
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+//   SS48I   - SSE 4.1 instructions with T8 prefix.
+//   SS4AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+//   SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[UseSSE42]>;
+
+//   SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used because SS42FI is only used for CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+//   SS42AI - SSE 4.2 instructions with TA prefix.
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+//   Instructions introduced in AVX (no SSE equivalent forms)
+//
+//   AVX8I   - AVX instructions with T8 and OpSize prefix.
+//   AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+        Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+        Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+//   Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+//   AVX28I   - AVX2 instructions with T8 and OpSize prefix.
+//   AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX2]>; +class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX2]>; + +// AES Instruction Templates: +// +// AES8I +// These use the same encoding as the SSE4.2 T8 and TA encodings. +class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, + Requires<[HasAES]>; + +class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + Requires<[HasAES]>; + +// PCLMUL Instruction Templates +class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + OpSize, Requires<[HasPCLMUL]>; + +class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>; + +// FMA3 Instruction Templates +class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8, + OpSize, VEX_4V, FMASC, Requires<[HasFMA]>; + +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, TA, + OpSize, VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP, XOP9, Requires<[HasXOP]>; + +// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP, XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + +// X86-64 Instruction templates... 
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMXRI  - MMX instructions with TB prefix and REX.W.
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>; +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>; +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>; +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; +class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td new file mode 100644 index 0000000..2a72fb6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -0,0 +1,432 @@ +//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides pattern fragments useful for SIMD instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Pattern Fragments +//===----------------------------------------------------------------------===// + +def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; +def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, + SDTCisFP<0>, SDTCisInt<2> ]>; +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisFP<1>, SDTCisVT<3, i8>]>; + +def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>; +def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>; +def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>; +def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>; + +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; + +// Commutative and Associative FMIN and FMAX. 
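+// (For illustration: the SDNPCommutative and SDNPAssociative flags below let
+// instruction selection match a pattern such as (X86fminc VR128:$a,
+// VR128:$b) with the operands in either order.)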
+def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; + +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; +def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; +def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; +def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; +def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86andnp : SDNode<"X86ISD::ANDNP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psign : SDNode<"X86ISD::PSIGN", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insrtps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; +def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; + +def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisOpSmallerThanOp<1, 0> ]>>; + +def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL", + SDTypeProfile<1, 1, + [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>; + +def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def X86vzext : SDNode<"X86ISD::VZEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; + +def X86vsext : SDNode<"X86ISD::VSEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; + +def X86vfpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>]>>; +def X86vfpround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>]>>; + +def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; +def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; +def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; +def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; + +def X86vshl : SDNode<"X86ISD::VSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsrl : SDNode<"X86ISD::VSRL", + SDTypeProfile<1, 2, 
[SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsra : SDNode<"X86ISD::VSRA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; + +def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; +def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; +def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; + +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; +def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; +def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; + +def X86pmuludq : SDNode<"X86ISD::PMULUDQ", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; + +// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get +// translated into one of the target nodes below during lowering. +// Note: this is a work in progress... +def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + +def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; + +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; + +def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; + +def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; + +def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; +def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; +def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; + +def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>; + +def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; +def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; +def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; + +def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>; +def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; +def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; + +def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; +def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; + +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; + +def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; +def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; + +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; + +def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; + +def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; +def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; +def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; + +def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, + SDTCisVT<4, i8>]>; +def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, v16i8>, SDTCisVT<3, i32>, + SDTCisVT<4, v16i8>, SDTCisVT<5, i32>, + SDTCisVT<6, i8>]>; + +def 
X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; +def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns +//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. +def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; +def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +// 128-bit load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; + +// 256-bit load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; + +// 128-/256-bit extload pattern fragments +def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; + +// Like 'store', but always requires 128-bit vector alignment. +def alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'store', but always requires 256-bit vector alignment. +def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 32; +}]>; + +// Like 'load', but always requires 128-bit vector alignment. +def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'X86vzload', but always requires 128-bit vector alignment. +def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'load', but always requires 256-bit vector alignment. 
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 32; +}]>; + +def alignedloadfsf32 : PatFrag<(ops node:$ptr), + (f32 (alignedload node:$ptr))>; +def alignedloadfsf64 : PatFrag<(ops node:$ptr), + (f64 (alignedload node:$ptr))>; + +// 128-bit aligned load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv4f32 : PatFrag<(ops node:$ptr), + (v4f32 (alignedload node:$ptr))>; +def alignedloadv2f64 : PatFrag<(ops node:$ptr), + (v2f64 (alignedload node:$ptr))>; +def alignedloadv2i64 : PatFrag<(ops node:$ptr), + (v2i64 (alignedload node:$ptr))>; + +// 256-bit aligned load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv8f32 : PatFrag<(ops node:$ptr), + (v8f32 (alignedload256 node:$ptr))>; +def alignedloadv4f64 : PatFrag<(ops node:$ptr), + (v4f64 (alignedload256 node:$ptr))>; +def alignedloadv4i64 : PatFrag<(ops node:$ptr), + (v4i64 (alignedload256 node:$ptr))>; + +// Like 'load', but uses special alignment checks suitable for use in +// memory operands in most SSE instructions, which are required to +// be naturally aligned on some targets but not on others. If the subtarget +// allows unaligned accesses, match any load, though this may require +// setting a feature bit in the processor (on startup, for example). +// Opteron 10h and later implement such a feature. +def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasVectorUAMem() + || cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; +def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; + +// 128-bit memop pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; +def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; +def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; + +// 256-bit memop pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; +def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; +def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; + +// SSSE3 uses MMX registers for some instructions. They aren't aligned on a +// 16-byte boundary. 
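+// For illustration: this is why memop64 below only checks for 8-byte
+// alignment, so a fragment such as (memopmmx addr:$src) can still fold an
+// 8-byte-aligned load into the memory form of an MMX SSSE3 instruction.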
+// FIXME: 8 byte alignment for mmx reads is not required +def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; + +def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; + +// MOVNT Support +// Like 'store', but requires the non-temporal bit to be set +def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal(); + return false; +}]>; + +def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && !ST->isTruncatingStore() && + ST->getAddressingMode() == ISD::UNINDEXED && + ST->getAlignment() >= 16; + return false; +}]>; + +def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && + ST->getAlignment() < 16; + return false; +}]>; + +// 128-bit bitconvert pattern fragments +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +// 256-bit bitconvert pattern fragments +def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>; +def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>; +def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; +def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; + +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +// BYTE_imm - Transform bit immediates into byte immediates. +def BYTE_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getZExtValue() >> 3); +}]>; + +// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index +// to VEXTRACTF128 imm. +def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N)); +}]>; + +// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to +// VINSERTF128 imm. 
+def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); +}]>; + +def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACTF128Index(N); +}], EXTRACT_get_vextractf128_imm>; + +def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERTF128Index(N); +}], INSERT_get_vinsertf128_imm>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp new file mode 100644 index 0000000..7c0423f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -0,0 +1,4896 @@ +//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include <limits> + +#define GET_INSTRINFO_CTOR +#include "X86GenInstrInfo.inc" + +using namespace llvm; + +static cl::opt<bool> +NoFusing("disable-spill-fusing", + cl::desc("Disable fusing of spill code into instructions")); +static cl::opt<bool> +PrintFailedFusing("print-failed-fuse-candidates", + cl::desc("Print instructions that the allocator wants to" + " fuse, but the X86 backend currently can't"), + cl::Hidden); +static cl::opt<bool> +ReMatPICStubLoad("remat-pic-stub-load", + cl::desc("Re-materialize load from stub in PIC mode"), + cl::init(false), cl::Hidden); + +enum { + // Select which memory operand is being unfolded. + // (stored in bits 0 - 3) + TB_INDEX_0 = 0, + TB_INDEX_1 = 1, + TB_INDEX_2 = 2, + TB_INDEX_3 = 3, + TB_INDEX_MASK = 0xf, + + // Do not insert the reverse map (MemOp -> RegOp) into the table. + // This may be needed because there is a many -> one mapping. + TB_NO_REVERSE = 1 << 4, + + // Do not insert the forward map (RegOp -> MemOp) into the table. + // This is needed for Native Client, which prohibits branch + // instructions from using a memory operand. + TB_NO_FORWARD = 1 << 5, + + TB_FOLDED_LOAD = 1 << 6, + TB_FOLDED_STORE = 1 << 7, + + // Minimum alignment required for load/store. + // Used for RegOp->MemOp conversion. 
+ // (stored in bits 8 - 15) + TB_ALIGN_SHIFT = 8, + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT +}; + +struct X86OpTblEntry { + uint16_t RegOp; + uint16_t MemOp; + uint16_t Flags; +}; + +X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) + : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32)), + TM(tm), RI(tm, *this) { + + static const X86OpTblEntry OpTbl2Addr[] = { + { X86::ADC32ri, X86::ADC32mi, 0 }, + { X86::ADC32ri8, X86::ADC32mi8, 0 }, + { X86::ADC32rr, X86::ADC32mr, 0 }, + { X86::ADC64ri32, X86::ADC64mi32, 0 }, + { X86::ADC64ri8, X86::ADC64mi8, 0 }, + { X86::ADC64rr, X86::ADC64mr, 0 }, + { X86::ADD16ri, X86::ADD16mi, 0 }, + { X86::ADD16ri8, X86::ADD16mi8, 0 }, + { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, + { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, + { X86::ADD16rr, X86::ADD16mr, 0 }, + { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, + { X86::ADD32ri, X86::ADD32mi, 0 }, + { X86::ADD32ri8, X86::ADD32mi8, 0 }, + { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, + { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32mr, 0 }, + { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, + { X86::ADD64ri32, X86::ADD64mi32, 0 }, + { X86::ADD64ri8, X86::ADD64mi8, 0 }, + { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, + { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64mr, 0 }, + { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, + { X86::ADD8ri, X86::ADD8mi, 0 }, + { X86::ADD8rr, X86::ADD8mr, 0 }, + { X86::AND16ri, X86::AND16mi, 0 }, + { X86::AND16ri8, X86::AND16mi8, 0 }, + { X86::AND16rr, X86::AND16mr, 0 }, + { X86::AND32ri, X86::AND32mi, 0 }, + { X86::AND32ri8, X86::AND32mi8, 0 }, + { X86::AND32rr, X86::AND32mr, 0 }, + { X86::AND64ri32, X86::AND64mi32, 0 }, + { X86::AND64ri8, X86::AND64mi8, 0 }, + { X86::AND64rr, X86::AND64mr, 0 }, + { X86::AND8ri, X86::AND8mi, 0 }, + { X86::AND8rr, X86::AND8mr, 0 }, + { X86::DEC16r, X86::DEC16m, 0 }, + { X86::DEC32r, X86::DEC32m, 0 }, + { X86::DEC64_16r, X86::DEC64_16m, 0 }, + { X86::DEC64_32r, X86::DEC64_32m, 0 }, + { X86::DEC64r, X86::DEC64m, 0 }, + { X86::DEC8r, X86::DEC8m, 0 }, + { X86::INC16r, X86::INC16m, 0 }, + { X86::INC32r, X86::INC32m, 0 }, + { X86::INC64_16r, X86::INC64_16m, 0 }, + { X86::INC64_32r, X86::INC64_32m, 0 }, + { X86::INC64r, X86::INC64m, 0 }, + { X86::INC8r, X86::INC8m, 0 }, + { X86::NEG16r, X86::NEG16m, 0 }, + { X86::NEG32r, X86::NEG32m, 0 }, + { X86::NEG64r, X86::NEG64m, 0 }, + { X86::NEG8r, X86::NEG8m, 0 }, + { X86::NOT16r, X86::NOT16m, 0 }, + { X86::NOT32r, X86::NOT32m, 0 }, + { X86::NOT64r, X86::NOT64m, 0 }, + { X86::NOT8r, X86::NOT8m, 0 }, + { X86::OR16ri, X86::OR16mi, 0 }, + { X86::OR16ri8, X86::OR16mi8, 0 }, + { X86::OR16rr, X86::OR16mr, 0 }, + { X86::OR32ri, X86::OR32mi, 0 }, + { X86::OR32ri8, X86::OR32mi8, 0 }, + { X86::OR32rr, X86::OR32mr, 0 }, + { X86::OR64ri32, X86::OR64mi32, 0 }, + { X86::OR64ri8, X86::OR64mi8, 0 }, + { X86::OR64rr, X86::OR64mr, 0 }, + { X86::OR8ri, X86::OR8mi, 0 }, + { X86::OR8rr, X86::OR8mr, 0 }, + { X86::ROL16r1, X86::ROL16m1, 0 }, + { X86::ROL16rCL, X86::ROL16mCL, 0 }, + { X86::ROL16ri, X86::ROL16mi, 0 }, + { X86::ROL32r1, X86::ROL32m1, 0 }, + { X86::ROL32rCL, X86::ROL32mCL, 0 }, + { X86::ROL32ri, X86::ROL32mi, 0 }, + { X86::ROL64r1, X86::ROL64m1, 0 }, + { 
X86::ROL64rCL, X86::ROL64mCL, 0 }, + { X86::ROL64ri, X86::ROL64mi, 0 }, + { X86::ROL8r1, X86::ROL8m1, 0 }, + { X86::ROL8rCL, X86::ROL8mCL, 0 }, + { X86::ROL8ri, X86::ROL8mi, 0 }, + { X86::ROR16r1, X86::ROR16m1, 0 }, + { X86::ROR16rCL, X86::ROR16mCL, 0 }, + { X86::ROR16ri, X86::ROR16mi, 0 }, + { X86::ROR32r1, X86::ROR32m1, 0 }, + { X86::ROR32rCL, X86::ROR32mCL, 0 }, + { X86::ROR32ri, X86::ROR32mi, 0 }, + { X86::ROR64r1, X86::ROR64m1, 0 }, + { X86::ROR64rCL, X86::ROR64mCL, 0 }, + { X86::ROR64ri, X86::ROR64mi, 0 }, + { X86::ROR8r1, X86::ROR8m1, 0 }, + { X86::ROR8rCL, X86::ROR8mCL, 0 }, + { X86::ROR8ri, X86::ROR8mi, 0 }, + { X86::SAR16r1, X86::SAR16m1, 0 }, + { X86::SAR16rCL, X86::SAR16mCL, 0 }, + { X86::SAR16ri, X86::SAR16mi, 0 }, + { X86::SAR32r1, X86::SAR32m1, 0 }, + { X86::SAR32rCL, X86::SAR32mCL, 0 }, + { X86::SAR32ri, X86::SAR32mi, 0 }, + { X86::SAR64r1, X86::SAR64m1, 0 }, + { X86::SAR64rCL, X86::SAR64mCL, 0 }, + { X86::SAR64ri, X86::SAR64mi, 0 }, + { X86::SAR8r1, X86::SAR8m1, 0 }, + { X86::SAR8rCL, X86::SAR8mCL, 0 }, + { X86::SAR8ri, X86::SAR8mi, 0 }, + { X86::SBB32ri, X86::SBB32mi, 0 }, + { X86::SBB32ri8, X86::SBB32mi8, 0 }, + { X86::SBB32rr, X86::SBB32mr, 0 }, + { X86::SBB64ri32, X86::SBB64mi32, 0 }, + { X86::SBB64ri8, X86::SBB64mi8, 0 }, + { X86::SBB64rr, X86::SBB64mr, 0 }, + { X86::SHL16rCL, X86::SHL16mCL, 0 }, + { X86::SHL16ri, X86::SHL16mi, 0 }, + { X86::SHL32rCL, X86::SHL32mCL, 0 }, + { X86::SHL32ri, X86::SHL32mi, 0 }, + { X86::SHL64rCL, X86::SHL64mCL, 0 }, + { X86::SHL64ri, X86::SHL64mi, 0 }, + { X86::SHL8rCL, X86::SHL8mCL, 0 }, + { X86::SHL8ri, X86::SHL8mi, 0 }, + { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, + { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, + { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, + { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, + { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, + { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, + { X86::SHR16r1, X86::SHR16m1, 0 }, + { X86::SHR16rCL, X86::SHR16mCL, 0 }, + { X86::SHR16ri, X86::SHR16mi, 0 }, + { X86::SHR32r1, X86::SHR32m1, 0 }, + { X86::SHR32rCL, X86::SHR32mCL, 0 }, + { X86::SHR32ri, X86::SHR32mi, 0 }, + { X86::SHR64r1, X86::SHR64m1, 0 }, + { X86::SHR64rCL, X86::SHR64mCL, 0 }, + { X86::SHR64ri, X86::SHR64mi, 0 }, + { X86::SHR8r1, X86::SHR8m1, 0 }, + { X86::SHR8rCL, X86::SHR8mCL, 0 }, + { X86::SHR8ri, X86::SHR8mi, 0 }, + { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, + { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, + { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, + { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, + { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, + { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, + { X86::SUB16ri, X86::SUB16mi, 0 }, + { X86::SUB16ri8, X86::SUB16mi8, 0 }, + { X86::SUB16rr, X86::SUB16mr, 0 }, + { X86::SUB32ri, X86::SUB32mi, 0 }, + { X86::SUB32ri8, X86::SUB32mi8, 0 }, + { X86::SUB32rr, X86::SUB32mr, 0 }, + { X86::SUB64ri32, X86::SUB64mi32, 0 }, + { X86::SUB64ri8, X86::SUB64mi8, 0 }, + { X86::SUB64rr, X86::SUB64mr, 0 }, + { X86::SUB8ri, X86::SUB8mi, 0 }, + { X86::SUB8rr, X86::SUB8mr, 0 }, + { X86::XOR16ri, X86::XOR16mi, 0 }, + { X86::XOR16ri8, X86::XOR16mi8, 0 }, + { X86::XOR16rr, X86::XOR16mr, 0 }, + { X86::XOR32ri, X86::XOR32mi, 0 }, + { X86::XOR32ri8, X86::XOR32mi8, 0 }, + { X86::XOR32rr, X86::XOR32mr, 0 }, + { X86::XOR64ri32, X86::XOR64mi32, 0 }, + { X86::XOR64ri8, X86::XOR64mi8, 0 }, + { X86::XOR64rr, X86::XOR64mr, 0 }, + { X86::XOR8ri, X86::XOR8mi, 0 }, + { X86::XOR8rr, X86::XOR8mr, 0 } + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { + unsigned RegOp = OpTbl2Addr[i].RegOp; + unsigned MemOp = OpTbl2Addr[i].MemOp; + 
unsigned Flags = OpTbl2Addr[i].Flags; + AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, + RegOp, MemOp, + // Index 0, folded load and store, no alignment requirement. + Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); + } + + static const X86OpTblEntry OpTbl0[] = { + { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, + { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, + { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, + { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, + { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, + { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, + { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, + { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, + { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, + { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, + { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, + { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, + { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, + { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, + { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, + { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, + { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, + { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, + { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, + { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, + { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, + { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, + { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, + { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, + { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, + { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, + { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, + { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, + { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, + { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, + { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, + { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, + { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, + { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, + { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, + { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, + { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, + { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, + { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, + { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, + { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, + { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, + { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, + { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, + { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, + { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, + { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, + { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, + { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, + { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, + { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, + { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, + { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, + { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, + { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, + { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, + { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, + { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, + { 
X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, + { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, + { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, + { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, + { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, + { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, + { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, + { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, + { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, + { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, + { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, + { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, + { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + // AVX 128-bit versions of foldable instructions + { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, + { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, + { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, + { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, + { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, + // AVX 256-bit foldable instructions + { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, + { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE } + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { + unsigned RegOp = OpTbl0[i].RegOp; + unsigned MemOp = OpTbl0[i].MemOp; + unsigned Flags = OpTbl0[i].Flags; + AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, + RegOp, MemOp, TB_INDEX_0 | Flags); + } + + static const X86OpTblEntry OpTbl1[] = { + { X86::CMP16rr, X86::CMP16rm, 0 }, + { X86::CMP32rr, X86::CMP32rm, 0 }, + { X86::CMP64rr, X86::CMP64rm, 0 }, + { X86::CMP8rr, X86::CMP8rm, 0 }, + { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, + { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, + { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, + { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, + { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, + { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, + { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, + { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, + { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, + { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, + { X86::FsMOVAPDrr, X86::MOVSDrm, TB_NO_REVERSE }, + { X86::FsMOVAPSrr, X86::MOVSSrm, TB_NO_REVERSE }, + { X86::IMUL16rri, X86::IMUL16rmi, 0 }, + { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, + { X86::IMUL32rri, X86::IMUL32rmi, 0 }, + { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, + { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, + { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, + { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, + { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, + { 
X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, + { X86::MOV16rr, X86::MOV16rm, 0 }, + { X86::MOV32rr, X86::MOV32rm, 0 }, + { X86::MOV64rr, X86::MOV64rm, 0 }, + { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, + { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, + { X86::MOV8rr, X86::MOV8rm, 0 }, + { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, + { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, + { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, + { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, + { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, + { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, + { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, + { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, + { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, + { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, + { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, + { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, + { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, + { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, + { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 }, + { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, + { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, + { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, + { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, + { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, + { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 }, + { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 }, + { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 }, + { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, + { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, + { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, + { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, + { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, + { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, + { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 }, + { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, + { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 }, + { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, + { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, + { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, + { X86::SQRTSDr, X86::SQRTSDm, 0 }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, + { X86::SQRTSSr, X86::SQRTSSm, 0 }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, + { X86::TEST16rr, X86::TEST16rm, 0 }, + { X86::TEST32rr, X86::TEST32rm, 0 }, + { X86::TEST64rr, X86::TEST64rm, 0 }, + { X86::TEST8rr, X86::TEST8rm, 0 }, + // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 + { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, + { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, + // AVX 128-bit versions of foldable instructions + { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, + { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, + { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, + { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::VCVTTSS2SI64rr, 
X86::VCVTTSS2SI64rm, 0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE }, + { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE }, + { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, + { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, + { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, + { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, + { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, + { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 }, + { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, + { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, + { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, + { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 }, + { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, + { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, + { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, + { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, + { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, + { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, + { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, + { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, + { X86::VRCPPSr, X86::VRCPPSm, 0 }, + { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 }, + { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, + { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 }, + { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, + { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, + { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, + { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + + // AVX 256-bit foldable instructions + { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, + { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, + { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, + { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + + // AVX2 foldable instructions + { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, + { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, + { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, + { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, + { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, + { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, + { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, + { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, + + // BMI/BMI2/LZCNT/POPCNT foldable instructions + { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, + { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, + { X86::BLSI32rr, X86::BLSI32rm, 0 }, + { X86::BLSI64rr, X86::BLSI64rm, 0 }, + { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, + { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, + { X86::BLSR32rr, X86::BLSR32rm, 0 }, + { X86::BLSR64rr, X86::BLSR64rm, 0 }, + { X86::BZHI32rr, X86::BZHI32rm, 0 }, + { X86::BZHI64rr, X86::BZHI64rm, 0 }, + { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, + { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, + { X86::LZCNT64rr, 
X86::LZCNT64rm, 0 }, + { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, + { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, + { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, + { X86::RORX32ri, X86::RORX32mi, 0 }, + { X86::RORX64ri, X86::RORX64mi, 0 }, + { X86::SARX32rr, X86::SARX32rm, 0 }, + { X86::SARX64rr, X86::SARX64rm, 0 }, + { X86::SHRX32rr, X86::SHRX32rm, 0 }, + { X86::SHRX64rr, X86::SHRX64rm, 0 }, + { X86::SHLX32rr, X86::SHLX32rm, 0 }, + { X86::SHLX64rr, X86::SHLX64rm, 0 }, + { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, + { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, + { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { + unsigned RegOp = OpTbl1[i].RegOp; + unsigned MemOp = OpTbl1[i].MemOp; + unsigned Flags = OpTbl1[i].Flags; + AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, + RegOp, MemOp, + // Index 1, folded load + Flags | TB_INDEX_1 | TB_FOLDED_LOAD); + } + + static const X86OpTblEntry OpTbl2[] = { + { X86::ADC32rr, X86::ADC32rm, 0 }, + { X86::ADC64rr, X86::ADC64rm, 0 }, + { X86::ADD16rr, X86::ADD16rm, 0 }, + { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32rm, 0 }, + { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64rm, 0 }, + { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, + { X86::ADD8rr, X86::ADD8rm, 0 }, + { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, + { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, + { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, + { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, + { X86::AND16rr, X86::AND16rm, 0 }, + { X86::AND32rr, X86::AND32rm, 0 }, + { X86::AND64rr, X86::AND64rm, 0 }, + { X86::AND8rr, X86::AND8rm, 0 }, + { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, + { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, + { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, + { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, + { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, + { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, + { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, + { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, + { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, + { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, + { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, + { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, + { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, + { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, + { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, + { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, + { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, + { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, + { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, + { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, + { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, + { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, + { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, + { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, + { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, + { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, + { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, + { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, + { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, + { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, + { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, + { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, + { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, + { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, + { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, + { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, + { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, + { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, + { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, + { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, + { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, + { 
X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, + { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, + { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, + { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, + { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, + { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, + { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, + { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, + { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, + { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, + { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, + { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, + { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, + { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, + { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, + { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, + { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, + { X86::CMPSDrr, X86::CMPSDrm, 0 }, + { X86::CMPSSrr, X86::CMPSSrm, 0 }, + { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, + { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, + { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, + { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, + { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, + { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, + { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, + { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, + { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, + { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, + { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, + { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, + { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, + { X86::IMUL16rr, X86::IMUL16rm, 0 }, + { X86::IMUL32rr, X86::IMUL32rm, 0 }, + { X86::IMUL64rr, X86::IMUL64rm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, + { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, + { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, + { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, + { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, + { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, + { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, + { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, + { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, + { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, + { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, + { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, + { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::OR16rr, X86::OR16rm, 0 }, + { X86::OR32rr, X86::OR32rm, 0 }, + { X86::OR64rr, X86::OR64rm, 0 }, + { X86::OR8rr, X86::OR8rm, 0 }, + { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, + { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, + { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, + { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, + { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, + { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, + { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, + { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, + { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, + { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, + { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, + { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, + { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, + { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, + { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 }, + { X86::PANDNrr, 
X86::PANDNrm, TB_ALIGN_16 }, + { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, + { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, + { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, + { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, + { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, + { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, + { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, + { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, + { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, + { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, + { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, + { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, + { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 }, + { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, + { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, + { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, + { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, + { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 }, + { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, + { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, + { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, + { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, + { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, + { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, + { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, + { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, + { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, + { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, + { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, + { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, + { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, + { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, + { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, + { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, + { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, + { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, + { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, + { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, + { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, + { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, + { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, + { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, + { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 }, + { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 }, + { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 }, + { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, + { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, + { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, + { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, + { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 }, + { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, + { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, + { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, + { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, + { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, + { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, + { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, + { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, + { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, + { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, + { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, + { X86::SBB32rr, X86::SBB32rm, 0 }, + { X86::SBB64rr, X86::SBB64rm, 0 }, + { X86::SHUFPDrri, X86::SHUFPDrmi, 
TB_ALIGN_16 }, + { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, + { X86::SUB16rr, X86::SUB16rm, 0 }, + { X86::SUB32rr, X86::SUB32rm, 0 }, + { X86::SUB64rr, X86::SUB64rm, 0 }, + { X86::SUB8rr, X86::SUB8rm, 0 }, + { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, + { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, + { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSSrr, X86::SUBSSrm, 0 }, + // FIXME: TEST*rr -> swapped operand of TEST*mr. + { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, + { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, + { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, + { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, + { X86::XOR16rr, X86::XOR16rm, 0 }, + { X86::XOR32rr, X86::XOR32rm, 0 }, + { X86::XOR64rr, X86::XOR64rm, 0 }, + { X86::XOR8rr, X86::XOR8rm, 0 }, + { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, + { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, + // AVX 128-bit versions of foldable instructions + { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, + { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, + { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, + { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, + { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, + { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, + { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, + { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, + { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, + { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, + { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, + { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, + { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VADDPDrr, X86::VADDPDrm, 0 }, + { X86::VADDPSrr, X86::VADDPSrm, 0 }, + { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, + { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, + { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, + { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, + { X86::VANDPDrr, X86::VANDPDrm, 0 }, + { X86::VANDPSrr, X86::VANDPSrm, 0 }, + { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, + { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, + { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, + { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, + { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, + { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, + { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, + { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, + { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, + { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, + { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, + { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, + { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 }, + { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 }, + { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 }, + { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 }, + { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 }, + { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 }, + { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 }, + { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, + { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, + { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, + { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, + { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, + { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, + { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, + { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, + { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMINPDrr, X86::VMINPDrm, 0 }, + { X86::VMINPSrr, 
X86::VMINPSrm, 0 }, + { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, + { X86::VMULPDrr, X86::VMULPDrm, 0 }, + { X86::VMULPSrr, X86::VMULPSrm, 0 }, + { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VORPDrr, X86::VORPDrm, 0 }, + { X86::VORPSrr, X86::VORPSrm, 0 }, + { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, + { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, + { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, + { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, + { X86::VPADDBrr, X86::VPADDBrm, 0 }, + { X86::VPADDDrr, X86::VPADDDrm, 0 }, + { X86::VPADDQrr, X86::VPADDQrm, 0 }, + { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, + { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, + { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, + { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, + { X86::VPADDWrr, X86::VPADDWrm, 0 }, + { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 }, + { X86::VPANDNrr, X86::VPANDNrm, 0 }, + { X86::VPANDrr, X86::VPANDrm, 0 }, + { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, + { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, + { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, + { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, + { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, + { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, + { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, + { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, + { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, + { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, + { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, + { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, + { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, + { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, + { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, + { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, + { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, + { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, + { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, + { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, + { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, + { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, + { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, + { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, + { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, + { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, + { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, + { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, + { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, + { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, + { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, + { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, + { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, + { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, + { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, + { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, + { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, + { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, + { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, + { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, + { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, + { X86::VPORrr, X86::VPORrm, 0 }, + { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, + { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, + { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 }, + { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 }, + { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 }, + { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, + { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, + { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, + { X86::VPSRADrr, X86::VPSRADrm, 0 }, + { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, + { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, + { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, + { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, + { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, + { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, + { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, + { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, + { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, + { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, + { 
X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, + { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, + { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, + { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, + { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, + { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, + { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, + { X86::VPXORrr, X86::VPXORrm, 0 }, + { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, + { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, + { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, + { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, + { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, + { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, + { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, + { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, + { X86::VXORPDrr, X86::VXORPDrm, 0 }, + { X86::VXORPSrr, X86::VXORPSrm, 0 }, + // AVX 256-bit foldable instructions + { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, + { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, + { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, + { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, + { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, + { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, + { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, + { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, + { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, + { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, + { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, + { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, + { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, + { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, + { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, + { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, + { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, + { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, + { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, + { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, + { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, + { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, + { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, + { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, + { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, + { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, + { X86::VORPDYrr, X86::VORPDYrm, 0 }, + { X86::VORPSYrr, X86::VORPSYrm, 0 }, + { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, + { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, + { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, + { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, + { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, + { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, + { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, + { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, + { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, + { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, + { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, + { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, + { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, + // AVX2 foldable instructions + { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, + { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, + { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, + { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, + { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, + { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, + { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, + { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, + { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, + { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, + { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, + { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, + { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, + { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 }, + { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, + { X86::VPANDYrr, X86::VPANDYrm, 0 }, + { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, + { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, + 
{ X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, + { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, + { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, + { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, + { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, + { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, + { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, + { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, + { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, + { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, + { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, + { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, + { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, + { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, + { X86::VPERMQYri, X86::VPERMQYmi, 0 }, + { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, + { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, + { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, + { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, + { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, + { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, + { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, + { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, + { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, + { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, + { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, + { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, + { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, + { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, + { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, + { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 }, + { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, + { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, + { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, + { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, + { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, + { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, + { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, + { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, + { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, + { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, + { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, + { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, + { X86::VPORYrr, X86::VPORYrm, 0 }, + { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, + { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, + { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 }, + { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 }, + { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 }, + { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, + { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, + { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, + { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, + { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, + { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, + { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, + { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, + { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, + { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, + { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, + { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, + { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, + { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, + { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, + { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, + { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, + { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, + { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, + { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, + { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, + { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, + { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, + { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, + { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, + { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, + { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, + { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, + { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, + { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, + { X86::VPUNPCKLWDYrr, 
X86::VPUNPCKLWDYrm, 0 }, + { X86::VPXORYrr, X86::VPXORYrm, 0 }, + // FIXME: add AVX 256-bit foldable instructions + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, + + // BMI/BMI2 foldable instructions + { X86::ANDN32rr, X86::ANDN32rm, 0 }, + { X86::ANDN64rr, X86::ANDN64rm, 0 }, + { X86::MULX32rr, X86::MULX32rm, 0 }, + { X86::MULX64rr, X86::MULX64rm, 0 }, + { X86::PDEP32rr, X86::PDEP32rm, 0 }, + { X86::PDEP64rr, X86::PDEP64rm, 0 }, + { X86::PEXT32rr, X86::PEXT32rm, 0 }, + { X86::PEXT64rr, X86::PEXT64rm, 0 }, + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { + unsigned RegOp = OpTbl2[i].RegOp; + unsigned MemOp = OpTbl2[i].MemOp; + unsigned Flags = OpTbl2[i].Flags; + AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, + RegOp, MemOp, + // Index 2, folded load + Flags | TB_INDEX_2 | TB_FOLDED_LOAD); + } + + static const X86OpTblEntry OpTbl3[] = { + // FMA foldable instructions + { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 }, + { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 }, + { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 }, + { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 }, + { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 }, + { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 }, + + { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 }, + { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 }, + { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 }, + { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 }, + { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 }, + { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 }, + { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 }, + { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 }, + { 
X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 }, + { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 }, + { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 }, + + { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 }, + { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 }, + { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 }, + { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 }, + { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 }, + + { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 }, + { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 }, + { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 }, + { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 }, + { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 }, + + { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 }, + { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 }, + { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 }, + { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 }, + { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 }, + + { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 }, + { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 }, + { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 }, + { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 }, + { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 }, + + { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 }, + { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 }, + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 }, + { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 }, + { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 }, + + { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr132rY, 
X86::VFNMSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 }, + + { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 }, + + { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 
},
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
+    unsigned RegOp = OpTbl3[i].RegOp;
+    unsigned MemOp = OpTbl3[i].MemOp;
+    unsigned Flags = OpTbl3[i].Flags;
+    AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 3, folded load
+                  Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+  }
+
+}
+
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags) {
+  if ((Flags & TB_NO_FORWARD) == 0) {
+    assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+    R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+  }
+  if ((Flags & TB_NO_REVERSE) == 0) {
+    assert(!M2RTable.count(MemOp) &&
+           "Duplicated entries in unfolding maps?");
+    M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+  }
+}
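[Editorial aside, not part of the imported file: the tables above are keyed by register-form opcode, and AddTableEntry builds the forward (fold) and reverse (unfold) maps from them, honoring TB_NO_FORWARD and TB_NO_REVERSE. A minimal stand-alone sketch of how such a table might be consulted when folding; the map type and the flag bit position used here are invented for illustration, the real TB_* values live in X86InstrInfo.h.]

    #include <cstdint>
    #include <map>
    #include <utility>

    // Hypothetical flag bit; stands in for TB_ALIGN_16.
    enum : uint16_t { kAlign16 = 1 << 6 };

    using FoldTable = std::map<unsigned, std::pair<unsigned, uint16_t>>;

    // Return the memory-form opcode for RegOpc, or 0 if no fold is
    // possible (unknown opcode, or the operand is not aligned enough).
    unsigned lookupMemForm(const FoldTable &Tbl, unsigned RegOpc,
                           unsigned KnownAlign) {
      auto It = Tbl.find(RegOpc);
      if (It == Tbl.end())
        return 0;                              // opcode is not foldable
      if ((It->second.second & kAlign16) && KnownAlign < 16)
        return 0;                              // e.g. MOVAPS would fault
      return It->second.first;                 // e.g. ADDPSrr -> ADDPSrm
    }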
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                    unsigned &SrcReg, unsigned &DstReg,
+                                    unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case X86::MOVSX16rr8:
+  case X86::MOVZX16rr8:
+  case X86::MOVSX32rr8:
+  case X86::MOVZX32rr8:
+  case X86::MOVSX64rr8:
+  case X86::MOVZX64rr8:
+    if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+      // It's not always legal to reference the low 8-bit of the larger
+      // register in 32-bit mode.
+      return false;
+  case X86::MOVSX32rr16:
+  case X86::MOVZX32rr16:
+  case X86::MOVSX64rr16:
+  case X86::MOVZX64rr16:
+  case X86::MOVSX64rr32:
+  case X86::MOVZX64rr32: {
+    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+      // Be conservative.
+      return false;
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVSX16rr8:
+    case X86::MOVZX16rr8:
+    case X86::MOVSX32rr8:
+    case X86::MOVZX32rr8:
+    case X86::MOVSX64rr8:
+    case X86::MOVZX64rr8:
+      SubIdx = X86::sub_8bit;
+      break;
+    case X86::MOVSX32rr16:
+    case X86::MOVZX32rr16:
+    case X86::MOVSX64rr16:
+    case X86::MOVZX64rr16:
+      SubIdx = X86::sub_16bit;
+      break;
+    case X86::MOVSX64rr32:
+    case X86::MOVZX64rr32:
+      SubIdx = X86::sub_32bit;
+      break;
+    }
+    return true;
+  }
+  }
+  return false;
+}
+
+/// isFrameOperand - Return true and the FrameIndex if the specified
+/// operand and following operands form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+                                  int &FrameIndex) const {
+  if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
+      MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
+      MI->getOperand(Op+1).getImm() == 1 &&
+      MI->getOperand(Op+2).getReg() == 0 &&
+      MI->getOperand(Op+3).getImm() == 0) {
+    FrameIndex = MI->getOperand(Op).getIndex();
+    return true;
+  }
+  return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return true;
+  }
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+  switch (Opcode) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV32mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MOVDQAmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVAPSmr:
+  case X86::VMOVAPDmr:
+  case X86::VMOVDQAmr:
+  case X86::VMOVAPSYmr:
+  case X86::VMOVAPDYmr:
+  case X86::VMOVDQAYmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                           int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode()))
+    if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+      return MI->getOperand(0).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                 int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                          int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode()))
+    if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+        isFrameOperand(MI, 0, FrameIndex))
+      return MI->getOperand(X86::AddrNumOperands).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
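[Editorial aside, not part of the imported file: the helpers above recognize stack-slot accesses by pattern-matching the operand layout every x86 memory reference uses in a MachineInstr: base, scale, index, displacement, followed by a segment operand. A plain [fi#N] access is exactly {frame index, scale 1, no index, displacement 0}, which is what isFrameOperand tests. A rough stand-alone model of that test, with invented types:]

    // Invented stand-in for the x86 address operands; in LLVM these are
    // consecutive MachineOperands starting at a known operand index.
    struct X86AddrMode {
      bool BaseIsFrameIndex;
      int  Scale;    // 1, 2, 4 or 8
      int  IndexReg; // 0 == no index register
      int  Disp;     // byte offset
    };

    // True for a direct stack-slot reference [fi#N], the only shape the
    // spiller-oriented queries above accept.
    bool isSimpleFrameRef(const X86AddrMode &AM) {
      return AM.BaseIsFrameIndex && AM.Scale == 1 && AM.IndexReg == 0 &&
             AM.Disp == 0;
    }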
+/// regIsPICBase - Return true if the register is a PIC base (i.e., defined
+/// by X86::MOVPC32r).
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+  // Don't waste compile time scanning use-def chains of physregs.
+  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+    return false;
+  bool isPICBase = false;
+  for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+         E = MRI.def_end(); I != E; ++I) {
+    MachineInstr *DefMI = I.getOperand().getParent();
+    if (DefMI->getOpcode() != X86::MOVPC32r)
+      return false;
+    assert(!isPICBase && "More than one PIC base?");
+    isPICBase = true;
+  }
+  return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+                                                AliasAnalysis *AA) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm: {
+    // Loads from constant pools are trivially rematerializable.
+    if (MI->getOperand(1).isReg() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+        MI->isInvariantLoad(AA)) {
+      unsigned BaseReg = MI->getOperand(1).getReg();
+      if (BaseReg == 0 || BaseReg == X86::RIP)
+        return true;
+      // Allow re-materialization of PIC load.
+      if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+        return false;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+
+  case X86::LEA32r:
+  case X86::LEA64r: {
+    if (MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+        !MI->getOperand(4).isReg()) {
+      // lea fi#, lea GV, etc. are all rematerializable.
+      if (!MI->getOperand(1).isReg())
+        return true;
+      unsigned BaseReg = MI->getOperand(1).getReg();
+      if (BaseReg == 0)
+        return true;
+      // Allow re-materialization of lea PICBase + x.
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+  }
+
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
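[Editorial aside, not part of the imported file: the test above reduces to two conditions. The load must read invariant memory, and its address must be reproducible anywhere in the function, meaning the base register is either absent, %rip, or the function's unique PIC base. A simplified model of that decision, with invented field names:]

    struct RematLoadInfo {
      bool InvariantMemory; // e.g. a constant-pool entry
      bool HasBaseReg;      // address uses a base register
      bool BaseIsRIP;       // base is %rip
      bool BaseIsPICBase;   // base is the single MOVPC32r-defined vreg
    };

    // Would re-executing the load at some other point in the function be
    // guaranteed to produce the same value?
    bool canRematerializeLoad(const RematLoadInfo &L) {
      if (!L.InvariantMemory)
        return false;                 // memory might have changed
      if (!L.HasBaseReg || L.BaseIsRIP)
        return true;                  // address is a function-wide constant
      return L.BaseIsPICBase;         // PIC base is set once, never redefined
    }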
+/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
+/// that would clobber the EFLAGS condition register. Note the result may be
+/// conservative. If it cannot definitely determine the safety after visiting
+/// a few instructions in each direction it assumes it's not safe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator E = MBB.end();
+
+  // For compile time consideration, if we are not able to determine the
+  // safety after visiting 4 instructions in each direction, we will assume
+  // it's not safe.
+  MachineBasicBlock::iterator Iter = I;
+  for (unsigned i = 0; Iter != E && i < 4; ++i) {
+    bool SeenDef = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SeenDef = true;
+      if (!MO.isReg())
+        continue;
+      if (MO.getReg() == X86::EFLAGS) {
+        if (MO.isUse())
+          return false;
+        SeenDef = true;
+      }
+    }
+
+    if (SeenDef)
+      // This instruction defines EFLAGS, no need to look any further.
+      return true;
+    ++Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != E && Iter->isDebugValue())
+      ++Iter;
+  }
+
+  // It is safe to clobber EFLAGS at the end of a block if no successor has
+  // it live-in.
+  if (Iter == E) {
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+           SE = MBB.succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+    return true;
+  }
+
+  MachineBasicBlock::iterator B = MBB.begin();
+  Iter = I;
+  for (unsigned i = 0; i < 4; ++i) {
+    // If we make it to the beginning of the block, it's safe to clobber
+    // EFLAGS iff EFLAGS is not live-in.
+    if (Iter == B)
+      return !MBB.isLiveIn(X86::EFLAGS);
+
+    --Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != B && Iter->isDebugValue())
+      --Iter;
+
+    bool SawKill = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      // A register mask may clobber EFLAGS, but we should still look for a
+      // live EFLAGS def.
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SawKill = true;
+      if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+        if (MO.isDef()) return MO.isDead();
+        if (MO.isKill()) SawKill = true;
+      }
+    }
+
+    if (SawKill)
+      // This instruction kills EFLAGS and doesn't redefine it, so
+      // there's no need to look further.
+      return true;
+  }
+
+  // Conservative answer.
+  return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 unsigned DestReg, unsigned SubIdx,
+                                 const MachineInstr *Orig,
+                                 const TargetRegisterInfo &TRI) const {
+  DebugLoc DL = Orig->getDebugLoc();
+
+  // MOV32r0 etc. are implemented with xor, which clobbers the condition
+  // codes. Re-materialize them as movri instructions to avoid side effects.
+  bool Clone = true;
+  unsigned Opc = Orig->getOpcode();
+  switch (Opc) {
+  default: break;
+  case X86::MOV8r0:
+  case X86::MOV16r0:
+  case X86::MOV32r0:
+  case X86::MOV64r0: {
+    if (!isSafeToClobberEFLAGS(MBB, I)) {
+      switch (Opc) {
+      default: llvm_unreachable("Unreachable!");
+      case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
+      case X86::MOV16r0: Opc = X86::MOV16ri; break;
+      case X86::MOV32r0: Opc = X86::MOV32ri; break;
+      case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
+      }
+      Clone = false;
+    }
+    break;
+  }
+  }
+
+  if (Clone) {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+    MBB.insert(I, MI);
+  } else {
+    BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
+  }
+
+  MachineInstr *NewMI = prior(I);
+  NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
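[Editorial aside, not part of the imported file: what reMaterialize above is guarding against, shown at the assembly level. The zeroing pseudos normally lower to a self-xor, which is compact but writes EFLAGS; when the clone point has EFLAGS live (say, between a compare and its branch), the pseudo must be emitted as a flag-neutral move instead. A toy model of that choice:]

    // Two lowerings of "zero a register":
    //   xorl %eax, %eax   // smaller encoding, clobbers EFLAGS
    //   movl $0, %eax     // longer encoding, leaves EFLAGS intact
    enum ZeroIdiom { XorSelf, MovImm0 };

    ZeroIdiom pickZeroIdiom(bool SafeToClobberEFLAGS) {
      return SafeToClobberEFLAGS ? XorSelf : MovImm0;
    }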
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
+/// 16-bit LEA is disabled: use 32-bit LEA to form 3-address code by promoting
+/// to a 32-bit superregister and then truncating back down to a 16-bit
+/// subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+                                           MachineFunction::iterator &MFI,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+  bool isDead = MI->getOperand(0).isDead();
+  bool isKill = MI->getOperand(1).isKill();
+
+  unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+    ? X86::LEA64_32r : X86::LEA32r;
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // we'll be shifting and then extracting the lower 16-bits.
+  // This has the potential to cause a partial register stall, e.g.
+  //   movw  (%rbp,%rcx,2), %dx
+  //   leal  -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineInstr *InsMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+    .addReg(Src, getKillRegState(isKill));
+
+  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+                                    get(Opc), leaOutReg);
+  switch (MIOpc) {
+  default: llvm_unreachable("Unreachable!");
+  case X86::SHL16ri: {
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    MIB.addReg(0).addImm(1 << ShAmt)
+       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+    break;
+  }
+  case X86::INC16r:
+  case X86::INC64_16r:
+    addRegOffset(MIB, leaInReg, true, 1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    addRegOffset(MIB, leaInReg, true, -1);
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    unsigned Src2 = MI->getOperand(2).getReg();
+    bool isKill2 = MI->getOperand(2).isKill();
+    unsigned leaInReg2 = 0;
+    MachineInstr *InsMI2 = 0;
+    if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028 needs
+      // just a single insert_subreg.
+      addRegReg(MIB, leaInReg, true, leaInReg, false);
+    } else {
+      leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // we'll be shifting and then extracting the lower 16-bits.
+      BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+      InsMI2 =
+        BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
+        .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+        .addReg(Src2, getKillRegState(isKill2));
+      addRegReg(MIB, leaInReg, true, leaInReg2, true);
+    }
+    if (LV && isKill2 && InsMI2)
+      LV->replaceKillInstruction(Src2, MI, InsMI2);
+    break;
+  }
+  }
+
+  MachineInstr *NewMI = MIB;
+  MachineInstr *ExtMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+    .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+  if (LV) {
+    // Update live variables
+    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+    if (isKill)
+      LV->replaceKillInstruction(Src, MI, InsMI);
+    if (isDead)
+      LV->replaceKillInstruction(Dest, MI, ExtMI);
+  }
+
+  return ExtMI;
+}
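[Editorial aside, not part of the imported file: the shape of the code convertToThreeAddressWithLEA emits, sketched as an assembly-level before/after for a 16-bit add by 8. Register names are arbitrary, and the COPYs usually coalesce away later.]

    // Before: addw is two-address, so computing "%cx + 8" into a new
    // register would need an extra 16-bit copy:
    //   movw %cx, %ax
    //   addw $8, %ax
    //
    // After the transformation:
    //   movw %cx, %dx        // COPY into sub_16bit of an undef GR32 vreg
    //   leal 8(%edx), %esi   // 32-bit LEA: three-address, writes no flags
    //   movw %si, %ax        // COPY the low 16 bits back out
    //
    // The IMPLICIT_DEF makes the upper 16 bits of the input vreg
    // officially "don't care", at the cost of a possible partial-register
    // stall on the leal.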
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand.  This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  // All input instructions are two-address instructions.  Get the known
+  // operands.
+  const MachineOperand &Dest = MI->getOperand(0);
+  const MachineOperand &Src = MI->getOperand(1);
+
+  MachineInstr *NewMI = NULL;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  // 16-bit LEA is also slow on Core2.
+  bool DisableLEA16 = true;
+  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+  unsigned MIOpc = MI->getOpcode();
+  switch (MIOpc) {
+  case X86::SHUFPSrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    if (B != C) return 0;
+    unsigned M = MI->getOperand(3).getImm();
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+      .addOperand(Dest).addOperand(Src).addImm(M);
+    break;
+  }
+  case X86::SHUFPDrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    if (B != C) return 0;
+    unsigned M = MI->getOperand(3).getImm();
+
+    // Convert to PSHUFD mask.
+    M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6) | 0x44;
+
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+      .addOperand(Dest).addOperand(Src).addImm(M);
+    break;
+  }
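[Editorial aside, not part of the imported file; kept in comment form since it sits between cases of the switch above. It checks the SHUFPD-to-PSHUFD mask conversion.]

    // SHUFPD's 2-bit immediate picks one qword per result lane; PSHUFD's
    // 8-bit immediate picks four dwords, so qword k becomes the dword pair
    // (2k, 2k+1). The constant 0x44 (0b01000100) supplies the "+1" dword
    // of each pair, and the formula above places 2*bit0 and 2*bit1 into
    // the low bits of the first and third dword selectors. Spot checks
    // (valid because both sources are the same register here):
    //   M = 0b00 -> 0x44 = dwords {0,1,0,1} = qwords {0,0}
    //   M = 0b01 -> 0x4E = dwords {2,3,0,1} = qwords {1,0}
    //   M = 0b11 -> 0xEE = dwords {2,3,2,3} = qwords {1,1}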
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    // LEA can't handle RSP.
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR64_NOSPRegClass))
+      return 0;
+
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    // LEA can't handle ESP.
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR32_NOSPRegClass))
+      return 0;
+
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  default: {
+    // The following opcodes also set the condition code register(s). Only
+    // convert them to an equivalent LEA if the condition code register defs
+    // are dead!
+    if (hasLiveCondCodeDef(MI))
+      return 0;
+
+    switch (MIOpc) {
+    default: return 0;
+    case X86::INC64r:
+    case X86::INC32r:
+    case X86::INC64_32r: {
+      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+      unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
+        (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+        (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
+
+      // LEA can't handle RSP.
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+        return 0;
+
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), 1);
+      break;
+    }
+    case X86::INC16r:
+    case X86::INC64_16r:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), 1);
+      break;
+    case X86::DEC64r:
+    case X86::DEC32r:
+    case X86::DEC64_32r: {
+      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+      unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
+        (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+        (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
+      // LEA can't handle RSP.
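+      // For example, "decq %reg" becomes "leaq -1(%reg), %dst": the
+      // addOffset() call below encodes base = Src, scale = 1, no index and
+      // disp = -1, once the source has been constrained to a *_NOSP class.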
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) + return 0; + + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), -1); + break; + } + case X86::DEC16r: + case X86::DEC64_16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); + break; + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc; + const TargetRegisterClass *RC; + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { + Opc = X86::LEA64r; + RC = &X86::GR64_NOSPRegClass; + } else { + Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + RC = &X86::GR32_NOSPRegClass; + } + + + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + + // LEA can't handle RSP. + if (TargetRegisterInfo::isVirtualRegister(Src2) && + !MF.getRegInfo().constrainRegClass(Src2, RC)) + return 0; + + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD16rr: + case X86::ADD16rr_DB: { + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + } + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (DisableLEA16) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + } + } + } + + if (!NewMI) return 0; + + if (LV) { // Update live variables + if (Src.isKill()) + LV->replaceKillInstruction(Src.getReg(), MI, NewMI); + if (Dest.isDead()) + LV->replaceKillInstruction(Dest.getReg(), MI, NewMI); + } + + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; +} + +/// commuteInstruction - We have a few instructions that must be hacked on to +/// commute them. +/// +MachineInstr * +X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { + switch (MI->getOpcode()) { + case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) + case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) + case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) + case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) + case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) + case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) + unsigned Opc; + unsigned Size; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; + case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; + case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; + case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; + case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; + case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; + } + unsigned Amt = MI->getOperand(3).getImm(); + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + MI->getOperand(3).setImm(Size-Amt); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } + case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: + case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: + case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: + case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr: + case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr: + case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr: + case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr: + case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr: + case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr: + case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr: + case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr: + case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr: + case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr: + case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr: + case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: + case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; + case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; + case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break; + case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break; + case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break; + case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break; + 
case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break; + case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break; + case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break; + case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break; + case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break; + case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break; + case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break; + case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break; + case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break; + case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break; + case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break; + case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break; + case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break; + case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break; + case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break; + case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break; + case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break; + case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break; + case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break; + case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break; + case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break; + case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break; + case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break; + case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break; + case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break; + case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break; + case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break; + case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break; + case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break; + case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break; + case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break; + case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break; + case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break; + case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break; + case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break; + case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break; + case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break; + case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break; + case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break; + case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break; + case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; + case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + // Fallthrough intended. + } + default: + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } +} + +static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { + switch (BrOpc) { + default: return X86::COND_INVALID; + case X86::JE_4: return X86::COND_E; + case X86::JNE_4: return X86::COND_NE; + case X86::JL_4: return X86::COND_L; + case X86::JLE_4: return X86::COND_LE; + case X86::JG_4: return X86::COND_G; + case X86::JGE_4: return X86::COND_GE; + case X86::JB_4: return X86::COND_B; + case X86::JBE_4: return X86::COND_BE; + case X86::JA_4: return X86::COND_A; + case X86::JAE_4: return X86::COND_AE; + case X86::JS_4: return X86::COND_S; + case X86::JNS_4: return X86::COND_NS; + case X86::JP_4: return X86::COND_P; + case X86::JNP_4: return X86::COND_NP; + case X86::JO_4: return X86::COND_O; + case X86::JNO_4: return X86::COND_NO; + } +} + +/// getCondFromSETOpc - return condition code of a SET opcode. 
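+/// Both the register and the memory form of a SET map to the same condition,
+/// e.g. X86::SETEr and X86::SETEm both yield X86::COND_E.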
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
+  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
+  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
+  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
+  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
+  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
+  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
+  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
+  }
+}
+
+/// getCondFromCMovOpc - return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
+  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
+    return X86::COND_A;
+  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+    return X86::COND_AE;
+  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
+  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
+    return X86::COND_B;
+  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+    return X86::COND_BE;
+  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
+  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
+    return X86::COND_E;
+  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
+  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
+    return X86::COND_G;
+  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+    return X86::COND_GE;
+  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
+  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
+    return X86::COND_L;
+  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+    return X86::COND_LE;
+  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+    return X86::COND_NE;
+  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+    return X86::COND_NO;
+  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+    return X86::COND_NP;
+  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+    return X86::COND_NS;
+  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
+  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
+    return X86::COND_O;
+  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
+  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
+    return X86::COND_P;
+
case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm: + case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr: + return X86::COND_S; + } +} + +unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::JE_4; + case X86::COND_NE: return X86::JNE_4; + case X86::COND_L: return X86::JL_4; + case X86::COND_LE: return X86::JLE_4; + case X86::COND_G: return X86::JG_4; + case X86::COND_GE: return X86::JGE_4; + case X86::COND_B: return X86::JB_4; + case X86::COND_BE: return X86::JBE_4; + case X86::COND_A: return X86::JA_4; + case X86::COND_AE: return X86::JAE_4; + case X86::COND_S: return X86::JS_4; + case X86::COND_NS: return X86::JNS_4; + case X86::COND_P: return X86::JP_4; + case X86::COND_NP: return X86::JNP_4; + case X86::COND_O: return X86::JO_4; + case X86::COND_NO: return X86::JNO_4; + } +} + +/// GetOppositeBranchCondition - Return the inverse of the specified condition, +/// e.g. turning COND_E to COND_NE. +X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::COND_NE; + case X86::COND_NE: return X86::COND_E; + case X86::COND_L: return X86::COND_GE; + case X86::COND_LE: return X86::COND_G; + case X86::COND_G: return X86::COND_LE; + case X86::COND_GE: return X86::COND_L; + case X86::COND_B: return X86::COND_AE; + case X86::COND_BE: return X86::COND_A; + case X86::COND_A: return X86::COND_BE; + case X86::COND_AE: return X86::COND_B; + case X86::COND_S: return X86::COND_NS; + case X86::COND_NS: return X86::COND_S; + case X86::COND_P: return X86::COND_NP; + case X86::COND_NP: return X86::COND_P; + case X86::COND_O: return X86::COND_NO; + case X86::COND_NO: return X86::COND_O; + } +} + +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). +static X86::CondCode getSwappedCondition(X86::CondCode CC) { + switch (CC) { + default: return X86::COND_INVALID; + case X86::COND_E: return X86::COND_E; + case X86::COND_NE: return X86::COND_NE; + case X86::COND_L: return X86::COND_G; + case X86::COND_LE: return X86::COND_GE; + case X86::COND_G: return X86::COND_L; + case X86::COND_GE: return X86::COND_LE; + case X86::COND_B: return X86::COND_A; + case X86::COND_BE: return X86::COND_AE; + case X86::COND_A: return X86::COND_B; + case X86::COND_AE: return X86::COND_BE; + } +} + +/// getSETFromCond - Return a set opcode for the given condition and +/// whether it has memory operand. +static unsigned getSETFromCond(X86::CondCode CC, + bool HasMemoryOperand) { + static const uint16_t Opc[16][2] = { + { X86::SETAr, X86::SETAm }, + { X86::SETAEr, X86::SETAEm }, + { X86::SETBr, X86::SETBm }, + { X86::SETBEr, X86::SETBEm }, + { X86::SETEr, X86::SETEm }, + { X86::SETGr, X86::SETGm }, + { X86::SETGEr, X86::SETGEm }, + { X86::SETLr, X86::SETLm }, + { X86::SETLEr, X86::SETLEm }, + { X86::SETNEr, X86::SETNEm }, + { X86::SETNOr, X86::SETNOm }, + { X86::SETNPr, X86::SETNPm }, + { X86::SETNSr, X86::SETNSm }, + { X86::SETOr, X86::SETOm }, + { X86::SETPr, X86::SETPm }, + { X86::SETSr, X86::SETSm } + }; + + assert(CC < 16 && "Can only handle standard cond codes"); + return Opc[CC][HasMemoryOperand ? 1 : 0]; +} + +/// getCMovFromCond - Return a cmov opcode for the given condition, +/// register size in bytes, and operand type. 
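+/// For example, (X86::COND_E, /*RegBytes=*/4, /*HasMemoryOperand=*/false)
+/// selects X86::CMOVE32rr, while the same condition with an 8-byte register
+/// and a memory operand selects X86::CMOVE64rm.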
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { + static const uint16_t Opc[32][3] = { + { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, + { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, + { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, + { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, + { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, + { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, + { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, + { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, + { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, + { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, + { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, + { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, + { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, + { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, + { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, + { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, + { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, + { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, + { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, + { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, + { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, + { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, + { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, + { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, + { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, + { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, + { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, + { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, + { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, + { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, + { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, + { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } + }; + + assert(CC < 16 && "Can only handle standard cond codes"); + unsigned Idx = HasMemoryOperand ? 16+CC : CC; + switch(RegBytes) { + default: llvm_unreachable("Illegal register size!"); + case 2: return Opc[Idx][0]; + case 4: return Opc[Idx][1]; + case 8: return Opc[Idx][2]; + } +} + +bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + if (!MI->isTerminator()) return false; + + // Conditional branch is a special case. + if (MI->isBranch() && !MI->isBarrier()) + return true; + if (!MI->isPredicable()) + return true; + return !isPredicated(MI); +} + +bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // Start from the bottom of the block and work up, examining the + // terminator instructions. + MachineBasicBlock::iterator I = MBB.end(); + MachineBasicBlock::iterator UnCondBrIter = MBB.end(); + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + + // Working from the bottom, when we see a non-terminator instruction, we're + // done. + if (!isUnpredicatedTerminator(I)) + break; + + // A terminator that isn't a branch can't easily be handled by this + // analysis. + if (!I->isBranch()) + return true; + + // Handle unconditional branches. + if (I->getOpcode() == X86::JMP_4) { + UnCondBrIter = I; + + if (!AllowModify) { + TBB = I->getOperand(0).getMBB(); + continue; + } + + // If the block has any instructions after a JMP, delete them. 
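+      // (They are unreachable: control flow never falls past an
+      // unconditional JMP, so erasing them is safe.)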
+ while (llvm::next(I) != MBB.end()) + llvm::next(I)->eraseFromParent(); + + Cond.clear(); + FBB = 0; + + // Delete the JMP if it's equivalent to a fall-through. + if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { + TBB = 0; + I->eraseFromParent(); + I = MBB.end(); + UnCondBrIter = MBB.end(); + continue; + } + + // TBB is used to indicate the unconditional destination. + TBB = I->getOperand(0).getMBB(); + continue; + } + + // Handle conditional branches. + X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); + if (BranchCode == X86::COND_INVALID) + return true; // Can't handle indirect branch. + + // Working from the bottom, handle the first conditional branch. + if (Cond.empty()) { + MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); + if (AllowModify && UnCondBrIter != MBB.end() && + MBB.isLayoutSuccessor(TargetBB)) { + // If we can modify the code and it ends in something like: + // + // jCC L1 + // jmp L2 + // L1: + // ... + // L2: + // + // Then we can change this to: + // + // jnCC L2 + // L1: + // ... + // L2: + // + // Which is a bit more efficient. + // We conditionally jump to the fall-through block. + BranchCode = GetOppositeBranchCondition(BranchCode); + unsigned JNCC = GetCondBranchFromCond(BranchCode); + MachineBasicBlock::iterator OldInst = I; + + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) + .addMBB(UnCondBrIter->getOperand(0).getMBB()); + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4)) + .addMBB(TargetBB); + + OldInst->eraseFromParent(); + UnCondBrIter->eraseFromParent(); + + // Restart the analysis. + UnCondBrIter = MBB.end(); + I = MBB.end(); + continue; + } + + FBB = TBB; + TBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(BranchCode)); + continue; + } + + // Handle subsequent conditional branches. Only handle the case where all + // conditional branches branch to the same destination and their condition + // opcodes fit one of the special multi-branch idioms. + assert(Cond.size() == 1); + assert(TBB); + + // Only handle the case where all conditional branches branch to the same + // destination. + if (TBB != I->getOperand(0).getMBB()) + return true; + + // If the conditions are the same, we can leave them alone. + X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); + if (OldBranchCode == BranchCode) + continue; + + // If they differ, see if they fit one of the known patterns. Theoretically, + // we could handle more patterns here, but we shouldn't expect to see them + // if instruction selection has done a reasonable job. + if ((OldBranchCode == X86::COND_NP && + BranchCode == X86::COND_E) || + (OldBranchCode == X86::COND_E && + BranchCode == X86::COND_NP)) + BranchCode = X86::COND_NP_OR_E; + else if ((OldBranchCode == X86::COND_P && + BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && + BranchCode == X86::COND_P)) + BranchCode = X86::COND_NE_OR_P; + else + return true; + + // Update the MachineOperand. + Cond[0].setImm(BranchCode); + } + + return false; +} + +unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + unsigned Count = 0; + + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + if (I->getOpcode() != X86::JMP_4 && + getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + break; + // Remove the branch. 
+ I->eraseFromParent(); + I = MBB.end(); + ++Count; + } + + return Count; +} + +unsigned +X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 1 || Cond.size() == 0) && + "X86 branch conditions have one component!"); + + if (Cond.empty()) { + // Unconditional branch? + assert(!FBB && "Unconditional branch with multiple successors!"); + BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB); + return 1; + } + + // Conditional branch. + unsigned Count = 0; + X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); + switch (CC) { + case X86::COND_NP_OR_E: + // Synthesize NP_OR_E with two branches. + BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB); + ++Count; + break; + case X86::COND_NE_OR_P: + // Synthesize NE_OR_P with two branches. + BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB); + ++Count; + break; + default: { + unsigned Opc = GetCondBranchFromCond(CC); + BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); + ++Count; + } + } + if (FBB) { + // Two-way Conditional branch. Insert the second branch. + BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB); + ++Count; + } + return Count; +} + +bool X86InstrInfo:: +canInsertSelect(const MachineBasicBlock &MBB, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + // Not all subtargets have cmov instructions. + if (!TM.getSubtarget<X86Subtarget>().hasCMov()) + return false; + if (Cond.size() != 1) + return false; + // We cannot do the composite conditions, at least not in SSA form. + if ((X86::CondCode)Cond[0].getImm() > X86::COND_S) + return false; + + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // We have cmov instructions for 16, 32, and 64 bit general purpose registers. + if (X86::GR16RegClass.hasSubClassEq(RC) || + X86::GR32RegClass.hasSubClassEq(RC) || + X86::GR64RegClass.hasSubClassEq(RC)) { + // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy + // Bridge. Probably Ivy Bridge as well. + CondCycles = 2; + TrueCycles = 2; + FalseCycles = 2; + return true; + } + + // Can't do vectors. + return false; +} + +void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(Cond.size() == 1 && "Invalid Cond array"); + unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), + MRI.getRegClass(DstReg)->getSize(), + false/*HasMemoryOperand*/); + BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); +} + +/// isHReg - Test if the given register is a physical h register. +static bool isHReg(unsigned Reg) { + return X86::GR8_ABCD_HRegClass.contains(Reg); +} + +// Try and copy between VR128/VR64 and GR64 registers. 
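+// For example, an XMM-to-GR64 copy is done with X86::MOVPQIto64rr (or the
+// VEX-encoded X86::VMOVPQIto64rr when AVX is available), which moves the low
+// quadword of the vector register; the opposite direction uses
+// X86::MOV64toPQIrr.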
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+                                        bool HasAVX) {
+  // SrcReg(VR128) -> DestReg(GR64)
+  // SrcReg(VR64)  -> DestReg(GR64)
+  // SrcReg(GR64)  -> DestReg(VR128)
+  // SrcReg(GR64)  -> DestReg(VR64)
+
+  if (X86::GR64RegClass.contains(DestReg)) {
+    if (X86::VR128RegClass.contains(SrcReg))
+      // Copy from a VR128 register to a GR64 register.
+      return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
+    if (X86::VR64RegClass.contains(SrcReg))
+      // Copy from a VR64 register to a GR64 register.
+      return X86::MOVSDto64rr;
+  } else if (X86::GR64RegClass.contains(SrcReg)) {
+    // Copy from a GR64 register to a VR128 register.
+    if (X86::VR128RegClass.contains(DestReg))
+      return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
+    // Copy from a GR64 register to a VR64 register.
+    if (X86::VR64RegClass.contains(DestReg))
+      return X86::MOV64toSDrr;
+  }
+
+  // SrcReg(FR32) -> DestReg(GR32)
+  // SrcReg(GR32) -> DestReg(FR32)
+
+  if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+    // Copy from an FR32 register to a GR32 register.
+    return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+
+  if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+    // Copy from a GR32 register to an FR32 register.
+    return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+
+  return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // First deal with the normal symmetric copies.
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  unsigned Opc;
+  if (X86::GR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV64rr;
+  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV32rr;
+  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV16rr;
+  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move. Otherwise use a normal move.
+    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+        TM.getSubtarget<X86Subtarget>().is64Bit()) {
+      Opc = X86::MOV8rr_NOREX;
+      // Both operands must be encodable without a REX prefix.
+      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+             "8-bit H register cannot be copied outside GR8_NOREX");
+    } else
+      Opc = X86::MOV8rr;
+  } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+    Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+    Opc = X86::VMOVAPSYrr;
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+  else
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+
+  if (Opc) {
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  // Moving EFLAGS to/from another register requires a push and a pop.
+  // Notice that we have to adjust the stack if we don't want to clobber the
+  // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
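+  // For example, copying EFLAGS into %rax is emitted as
+  //   pushfq
+  //   popq %rax
+  // and copying %rax into EFLAGS as "pushq %rax; popfq".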
+ if (SrcReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(DestReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSHF64)); + BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + return; + } + if (X86::GR32RegClass.contains(DestReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSHF32)); + BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); + return; + } + } + if (DestReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH64r)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::POPF64)); + return; + } + if (X86::GR32RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH32r)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::POPF32)); + return; + } + } + + DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) + << " to " << RI.getName(DestReg) << '\n'); + llvm_unreachable("Cannot emit physreg copy instruction"); +} + +static unsigned getLoadStoreRegOpcode(unsigned Reg, + const TargetRegisterClass *RC, + bool isStackAligned, + const TargetMachine &TM, + bool load) { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + switch (RC->getSize()) { + default: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : + (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : + (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); + return load ? X86::LD_Fp80m : X86::ST_FpP80m; + case 16: { + assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? + (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) : + (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + else + return load ? + (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) : + (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } + case 32: + assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; + else + return load ? 
X86::VMOVUPSYrm : X86::VMOVUPSYmr; + } +} + +static unsigned getStoreRegOpcode(unsigned SrcReg, + const TargetRegisterClass *RC, + bool isStackAligned, + TargetMachine &TM) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false); +} + + +static unsigned getLoadRegOpcode(unsigned DestReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const TargetMachine &TM) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true); +} + +void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + "Stack slot too small for store"); + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); +} + +void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, + bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + MIB.addReg(SrcReg, getKillRegState(isKill)); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + + +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); +} + +void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = RC->getSize() == 32 ? 
32 : 16;
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  // A SUB can be used to perform a comparison.
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(2).getImm();
+    return true;
+  case X86::CMP64rr:
+  case X86::CMP32rr:
+  case X86::CMP16rr:
+  case X86::CMP8rr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+    SrcReg = MI->getOperand(0).getReg();
+    if (MI->getOperand(1).getReg() != SrcReg) return false;
+    // Compare against zero.
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+  return false;
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
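+/// For example, in
+///   %r = SUB32rr %a, %b   (EFLAGS set exactly as CMP32rr %a, %b would)
+///   CMP32rr %a, %b
+/// the CMP is redundant; CMP32rr %b, %a can likewise be removed if every
+/// user of the flags is rewritten with the swapped condition code (see
+/// getSwappedCondition above).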
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if (((FlagI->getOpcode() == X86::CMP64rr && + OI->getOpcode() == X86::SUB64rr) || + (FlagI->getOpcode() == X86::CMP32rr && + OI->getOpcode() == X86::SUB32rr)|| + (FlagI->getOpcode() == X86::CMP16rr && + OI->getOpcode() == X86::SUB16rr)|| + (FlagI->getOpcode() == X86::CMP8rr && + OI->getOpcode() == X86::SUB8rr)) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; + + if (((FlagI->getOpcode() == X86::CMP64ri32 && + OI->getOpcode() == X86::SUB64ri32) || + (FlagI->getOpcode() == X86::CMP64ri8 && + OI->getOpcode() == X86::SUB64ri8) || + (FlagI->getOpcode() == X86::CMP32ri && + OI->getOpcode() == X86::SUB32ri) || + (FlagI->getOpcode() == X86::CMP32ri8 && + OI->getOpcode() == X86::SUB32ri8) || + (FlagI->getOpcode() == X86::CMP16ri && + OI->getOpcode() == X86::SUB16ri) || + (FlagI->getOpcode() == X86::CMP16ri8 && + OI->getOpcode() == X86::SUB16ri8) || + (FlagI->getOpcode() == X86::CMP8ri && + OI->getOpcode() == X86::SUB8ri)) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} + +/// isDefConvertible - check whether the definition can be converted +/// to remove a comparison against zero. +inline static bool isDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return false; + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: + case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: + case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: + case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: + case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: + case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: + case X86::DEC64_32r: case X86::DEC64_16r: + case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: + case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: + case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: + case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: + case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: + case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: + case X86::INC64_32r: case X86::INC64_16r: + case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: + case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: + case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: + case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: + case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: + case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: + case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: + case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: + case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: + case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: + case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: + case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: + case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: + case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: + case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::ANDN32rr: case X86::ANDN32rm: + case X86::ANDN64rr: case X86::ANDN64rm: + return true; + } +} + +/// optimizeCompareInstr - Check if there exists an earlier instruction that +/// operates on the same source operands and sets flags in the same way as +/// Compare; remove Compare if 
possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+                     int CmpMask, int CmpValue,
+                     const MachineRegisterInfo *MRI) const {
+  // Check whether we can replace SUB with CMP.
+  unsigned NewOpcode = 0;
+  switch (CmpInstr->getOpcode()) {
+  default: break;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr: {
+    if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+      return false;
+    // There is no use of the destination register; we can replace SUB with CMP.
+    switch (CmpInstr->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
+    case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
+    case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
+    case X86::SUB8rm:    NewOpcode = X86::CMP8rm;    break;
+    case X86::SUB64rr:   NewOpcode = X86::CMP64rr;   break;
+    case X86::SUB32rr:   NewOpcode = X86::CMP32rr;   break;
+    case X86::SUB16rr:   NewOpcode = X86::CMP16rr;   break;
+    case X86::SUB8rr:    NewOpcode = X86::CMP8rr;    break;
+    case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+    case X86::SUB64ri8:  NewOpcode = X86::CMP64ri8;  break;
+    case X86::SUB32ri:   NewOpcode = X86::CMP32ri;   break;
+    case X86::SUB32ri8:  NewOpcode = X86::CMP32ri8;  break;
+    case X86::SUB16ri:   NewOpcode = X86::CMP16ri;   break;
+    case X86::SUB16ri8:  NewOpcode = X86::CMP16ri8;  break;
+    case X86::SUB8ri:    NewOpcode = X86::CMP8ri;    break;
+    }
+    CmpInstr->setDesc(get(NewOpcode));
+    CmpInstr->RemoveOperand(0);
+    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+      return false;
+  }
+  }
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // CmpInstr is the first instruction of the BB.
+  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+  // If we are comparing against zero, check whether we can use MI to update
+  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+  bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+  if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
+                    !isDefConvertible(MI)))
+    return false;
+
+  // We are searching for an earlier instruction that can make CmpInstr
+  // redundant; that instruction will be saved in Sub.
+  MachineInstr *Sub = NULL;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  // We iterate backwards, starting from the instruction before CmpInstr, and
+  // stop when we reach the definition of a source register or are done with
+  // the BB.
+  // RI points to the instruction before CmpInstr.
+  // If the definition is in this basic block, RE points to the definition;
+  // otherwise, RE is the rend of the basic block.
+  MachineBasicBlock::reverse_iterator
+      RI = MachineBasicBlock::reverse_iterator(I),
+      RE = CmpInstr->getParent() == MI->getParent() ?
+           MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+           CmpInstr->getParent()->rend();
+  MachineInstr *Movr0Inst = 0;
+  for (; RI != RE; ++RI) {
+    MachineInstr *Instr = &*RI;
+    // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero && + isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { + Sub = Instr; + break; + } + + if (Instr->modifiesRegister(X86::EFLAGS, TRI) || + Instr->readsRegister(X86::EFLAGS, TRI)) { + // This instruction modifies or uses EFLAGS. + + // MOV32r0 etc. are implemented with xor which clobbers condition code. + // They are safe to move up, if the definition to EFLAGS is dead and + // earlier instructions do not read or write EFLAGS. + if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 || + Instr->getOpcode() == X86::MOV16r0 || + Instr->getOpcode() == X86::MOV32r0 || + Instr->getOpcode() == X86::MOV64r0) && + Instr->registerDefIsDead(X86::EFLAGS, TRI)) { + Movr0Inst = Instr; + continue; + } + + // We can't remove CmpInstr. + return false; + } + } + + // Return false if no candidates exist. + if (!IsCmpZero && !Sub) + return false; + + bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); + + // Scan forward from the instruction after CmpInstr for uses of EFLAGS. + // It is safe to remove CmpInstr if EFLAGS is redefined or killed. + // If we are done with the basic block, we need to check whether EFLAGS is + // live-out. + bool IsSafe = false; + SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; + MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); + for (++I; I != E; ++I) { + const MachineInstr &Instr = *I; + bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); + bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI); + // We should check the usage if this instruction uses and updates EFLAGS. + if (!UseEFLAGS && ModifyEFLAGS) { + // It is safe to remove CmpInstr if EFLAGS is updated again. + IsSafe = true; + break; + } + if (!UseEFLAGS && !ModifyEFLAGS) + continue; + + // EFLAGS is used by this instruction. + X86::CondCode OldCC; + bool OpcIsSET = false; + if (IsCmpZero || IsSwapped) { + // We decode the condition code from opcode. + if (Instr.isBranch()) + OldCC = getCondFromBranchOpc(Instr.getOpcode()); + else { + OldCC = getCondFromSETOpc(Instr.getOpcode()); + if (OldCC != X86::COND_INVALID) + OpcIsSET = true; + else + OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); + } + if (OldCC == X86::COND_INVALID) return false; + } + if (IsCmpZero) { + switch (OldCC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + // CF and OF are used, we can't perform this optimization. + return false; + } + } else if (IsSwapped) { + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs + // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + // We swap the condition code and synthesize the new opcode. + X86::CondCode NewCC = getSwappedCondition(OldCC); + if (NewCC == X86::COND_INVALID) return false; + + // Synthesize the new opcode. + bool HasMemoryOperand = Instr.hasOneMemOperand(); + unsigned NewOpc; + if (Instr.isBranch()) + NewOpc = GetCondBranchFromCond(NewCC); + else if(OpcIsSET) + NewOpc = getSETFromCond(NewCC, HasMemoryOperand); + else { + unsigned DstReg = Instr.getOperand(0).getReg(); + NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + HasMemoryOperand); + } + + // Push the MachineInstr to OpsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // instructions will be modified. 
+      OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+    }
+    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+      IsSafe = true;
+      break;
+    }
+  }
+
+  // If EFLAGS is neither killed nor re-defined, we should check whether it
+  // is live-out. If it is live-out, do not optimize.
+  if ((IsCmpZero || IsSwapped) && !IsSafe) {
+    MachineBasicBlock *MBB = CmpInstr->getParent();
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+  }
+
+  // The instruction to be updated is either Sub or MI.
+  Sub = IsCmpZero ? MI : Sub;
+  // Move Movr0Inst to the place right before Sub.
+  if (Movr0Inst) {
+    Sub->getParent()->remove(Movr0Inst);
+    Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+  }
+
+  // Make sure the Sub instruction defines EFLAGS and mark the def live.
+  unsigned LastOperand = Sub->getNumOperands() - 1;
+  assert(Sub->getNumOperands() >= 2 &&
+         Sub->getOperand(LastOperand).isReg() &&
+         Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
+         "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
+  Sub->getOperand(LastOperand).setIsDef(true);
+  Sub->getOperand(LastOperand).setIsDead(false);
+  CmpInstr->eraseFromParent();
+
+  // Modify the condition code of instructions in OpsToUpdate.
+  for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
+    OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+  return true;
+}
+
+/// optimizeLoadInstr - Try to remove the load by folding it into a register
+/// operand at the use. We fold the load if it defines a virtual register,
+/// the virtual register is used exactly once in the same BB, and the
+/// instructions in between do not load, store, or have side effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+                  unsigned &FoldAsLoadDefReg,
+                  MachineInstr *&DefMI) const {
+  if (FoldAsLoadDefReg == 0)
+    return 0;
+  // To be conservative, if there exists another load, clear the load candidate.
+  if (MI->mayLoad()) {
+    FoldAsLoadDefReg = 0;
+    return 0;
+  }
+
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(this, 0, SawStore))
+    return 0;
+
+  // We try to commute MI if possible.
+  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+    // Collect information about virtual register operands of MI.
+    unsigned SrcOperandId = 0;
+    bool FoundSrcOperand = false;
+    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != FoldAsLoadDefReg)
+        continue;
+      // Do not fold if we have a subreg use or a def or multiple uses.
+      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+        return 0;
+
+      SrcOperandId = i;
+      FoundSrcOperand = true;
+    }
+    if (!FoundSrcOperand) return 0;
+
+    // Check whether we can fold the def into SrcOperandId.
+    SmallVector<unsigned, 8> Ops;
+    Ops.push_back(SrcOperandId);
+    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+    if (FoldMI) {
+      FoldAsLoadDefReg = 0;
+      return FoldMI;
+    }
+
+    if (Idx == 1) {
+      // MI was changed but it didn't help; commute it back!
+      commuteInstruction(MI, false);
+      return 0;
+    }
+
+    // Check whether we can commute MI and enable folding.
+ if (MI->isCommutable()) { + MachineInstr *NewMI = commuteInstruction(MI, false); + // Unable to commute. + if (!NewMI) return 0; + if (NewMI != MI) { + // New instruction. It doesn't need to be kept. + NewMI->eraseFromParent(); + return 0; + } + } + } + return 0; +} + +/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr +/// instruction with two undef reads of the register being defined. This is +/// used for mapping: +/// %xmm4 = V_SET0 +/// to: +/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> +/// +static bool Expand2AddrUndef(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + unsigned Reg = MIB->getOperand(0).getReg(); + MIB->setDesc(Desc); + + // MachineInstr::addOperand() will insert explicit operands before any + // implicit operands. + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + // But we don't trust that. + assert(MIB->getOperand(1).getReg() == Reg && + MIB->getOperand(2).getReg() == Reg && "Misplaced operand"); + return true; +} + +bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + switch (MI->getOpcode()) { + case X86::SETB_C8r: + return Expand2AddrUndef(MIB, get(X86::SBB8rr)); + case X86::SETB_C16r: + return Expand2AddrUndef(MIB, get(X86::SBB16rr)); + case X86::SETB_C32r: + return Expand2AddrUndef(MIB, get(X86::SBB32rr)); + case X86::SETB_C64r: + return Expand2AddrUndef(MIB, get(X86::SBB64rr)); + case X86::V_SET0: + case X86::FsFLD0SS: + case X86::FsFLD0SD: + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); + case X86::AVX_SET0: + assert(HasAVX && "AVX not supported"); + return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + case X86::V_SETALLONES: + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); + case X86::AVX2_SETALLONES: + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::TEST8ri_NOREX: + MI->setDesc(get(X86::TEST8ri)); + return true; + } + return false; +} + +MachineInstr* +X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + X86AddressMode AM; + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = FrameIx; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE)); + addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr); + return &*MIB; +} + +static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, + const SmallVectorImpl<MachineOperand> &MOs, + MachineInstr *MI, + const TargetInstrInfo &TII) { + // Create the base instruction with the memory operand as the first part. + // Omit the implicit operands, something BuildMI can't do. + MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + unsigned NumAddrOps = MOs.size(); + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + if (NumAddrOps < 4) // FrameIndex only + addOffset(MIB, 0); + + // Loop over the rest of the ri operands, converting them over. 
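+  // (Operands 0 and 1 of the original instruction were the tied def/use
+  // register pair that the memory reference above replaced, so copying
+  // resumes at operand 2.)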
+ unsigned NumOps = MI->getDesc().getNumOperands()-2; + for (unsigned i = 0; i != NumOps; ++i) { + MachineOperand &MO = MI->getOperand(i+2); + MIB.addOperand(MO); + } + for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + MIB.addOperand(MO); + } + return MIB; +} + +static MachineInstr *FuseInst(MachineFunction &MF, + unsigned Opcode, unsigned OpNo, + const SmallVectorImpl<MachineOperand> &MOs, + MachineInstr *MI, const TargetInstrInfo &TII) { + // Omit the implicit operands, something BuildMI can't do. + MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (i == OpNo) { + assert(MO.isReg() && "Expected to fold into reg operand!"); + unsigned NumAddrOps = MOs.size(); + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + if (NumAddrOps < 4) // FrameIndex only + addOffset(MIB, 0); + } else { + MIB.addOperand(MO); + } + } + return MIB; +} + +static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, + const SmallVectorImpl<MachineOperand> &MOs, + MachineInstr *MI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode)); + + unsigned NumAddrOps = MOs.size(); + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + if (NumAddrOps < 4) // FrameIndex only + addOffset(MIB, 0); + return MIB.addImm(0); +} + +MachineInstr* +X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, unsigned i, + const SmallVectorImpl<MachineOperand> &MOs, + unsigned Size, unsigned Align) const { + const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect(); + bool isTwoAddrFold = false; + + // Atom favors register form of call. So, we do not fold loads into calls + // when X86Subtarget is Atom. + if (isCallRegIndirect && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { + return NULL; + } + + unsigned NumOps = MI->getDesc().getNumOperands(); + bool isTwoAddr = NumOps > 1 && + MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return NULL; + + MachineInstr *NewMI = NULL; + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. It requires + // replacing the *two* registers with the memory location. 
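+  // e.g. folding a stack slot into the tied operands of an ADD32rr turns it
+  // into an ADD32mr that reads, modifies, and writes the slot directly.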
+  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isReg() &&
+      MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+    isTwoAddrFold = true;
+  } else if (i == 0) { // If operand 0
+    unsigned Opc = 0;
+    switch (MI->getOpcode()) {
+    default: break;
+    case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+    case X86::MOV32r0: Opc = X86::MOV32mi;   break;
+    case X86::MOV16r0: Opc = X86::MOV16mi;   break;
+    case X86::MOV8r0:  Opc = X86::MOV8mi;    break;
+    }
+    if (Opc)
+      NewMI = MakeM0Inst(*this, Opc, MOs, MI);
+    if (NewMI)
+      return NewMI;
+
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (i == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (i == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (i == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
+  }
+
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+      OpcodeTablePtr->find(MI->getOpcode());
+    if (I != OpcodeTablePtr->end()) {
+      unsigned Opcode = I->second.first;
+      unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+      if (Align < MinAlign)
+        return NULL;
+      bool NarrowToMOV32rm = false;
+      if (Size) {
+        unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
+        if (Size < RCSize) {
+          // Check if it's safe to fold the load. If the size of the object is
+          // narrower than the load width, then it's not.
+          if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+            return NULL;
+          // If this is a 64-bit load, but the spill slot is 32, then we can do
+          // a 32-bit load which is implicitly zero-extended. This likely is due
+          // to liveintervalanalysis remat'ing a load from stack slot.
+          if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+            return NULL;
+          Opcode = X86::MOV32rm;
+          NarrowToMOV32rm = true;
+        }
+      }
+
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+      else
+        NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+      if (NarrowToMOV32rm) {
+        // This is the special case where we use a MOV32rm to load a 32-bit
+        // value and zero-extend the top bits. Change the destination register
+        // to a 32-bit one.
+        unsigned DstReg = NewMI->getOperand(0).getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+                                                   X86::sub_32bit));
+        else
+          NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+      }
+      return NewMI;
+    }
+  }
+
+  // No fusion
+  if (PrintFailedFusing && !MI->isCopy())
+    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+  return NULL;
+}
+
+/// hasPartialRegUpdate - Return true for all instructions that only update
+/// the first 32 or 64 bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+///   movss (%rdi), %xmm0
+///   cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+///   cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+/// +static bool hasPartialRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::CVTSI2SSrr: + case X86::CVTSI2SS64rr: + case X86::CVTSI2SDrr: + case X86::CVTSI2SD64rr: + case X86::CVTSD2SSrr: + case X86::Int_CVTSD2SSrr: + case X86::CVTSS2SDrr: + case X86::Int_CVTSS2SDrr: + case X86::RCPSSr: + case X86::RCPSSr_Int: + case X86::ROUNDSDr: + case X86::ROUNDSDr_Int: + case X86::ROUNDSSr: + case X86::ROUNDSSr_Int: + case X86::RSQRTSSr: + case X86::RSQRTSSr_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + // AVX encoded versions + case X86::VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrr: + case X86::VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrr: + case X86::VRCPSSr: + case X86::VROUNDSDr: + case X86::VROUNDSDr_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSr_Int: + case X86::VRSQRTSSr: + case X86::VSQRTSSr: + return true; + } + + return false; +} + +/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle +/// instructions we would like before a partial register update. +unsigned X86InstrInfo:: +getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // If MI is marked as reading Reg, the partial register update is wanted. + const MachineOperand &MO = MI->getOperand(0); + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (MO.readsReg() || MI->readsVirtualRegister(Reg)) + return 0; + } else { + if (MI->readsRegister(Reg, TRI)) + return 0; + } + + // If any of the preceding 16 instructions are reading Reg, insert a + // dependency breaking instruction. The magic number is based on a few + // Nehalem experiments. + return 16; +} + +void X86InstrInfo:: +breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + unsigned Reg = MI->getOperand(OpNum).getReg(); + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. + unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + } else + return; + MI->addRegisterKilled(Reg, TRI, true); +} + +MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const { + // Check switch flag + if (NoFusing) return NULL; + + // Unless optimizing for size, don't fold to avoid partial + // register update stalls + if (!MF.getFunction()->getAttributes(). 
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + hasPartialRegUpdate(MI->getOpcode())) + return 0; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Size = MFI->getObjectSize(FrameIndex); + unsigned Alignment = MFI->getObjectAlignment(FrameIndex); + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + unsigned RCSize = 0; + switch (MI->getOpcode()) { + default: return NULL; + case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break; + } + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Size < RCSize) + return NULL; + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return NULL; + + SmallVector<MachineOperand,4> MOs; + MOs.push_back(MachineOperand::CreateFI(FrameIndex)); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment); +} + +MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const { + // Check switch flag + if (NoFusing) return NULL; + + // Unless optimizing for size, don't fold to avoid partial + // register update stalls + if (!MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // Determine the alignment of the load. + unsigned Alignment = 0; + if (LoadMI->hasOneMemOperand()) + Alignment = (*LoadMI->memoperands_begin())->getAlignment(); + else + switch (LoadMI->getOpcode()) { + case X86::AVX2_SETALLONES: + case X86::AVX_SET0: + Alignment = 32; + break; + case X86::V_SET0: + case X86::V_SETALLONES: + Alignment = 16; + break; + case X86::FsFLD0SD: + Alignment = 8; + break; + case X86::FsFLD0SS: + Alignment = 4; + break; + default: + return 0; + } + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + switch (MI->getOpcode()) { + default: return NULL; + case X86::TEST8rr: NewOpc = X86::CMP8ri; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; + } + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return NULL; + + // Make sure the subregisters match. + // Otherwise we risk changing the size of the load. + if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) + return NULL; + + SmallVector<MachineOperand,X86::AddrNumOperands> MOs; + switch (LoadMI->getOpcode()) { + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::AVX2_SETALLONES: + case X86::AVX_SET0: + case X86::FsFLD0SD: + case X86::FsFLD0SS: { + // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Create a constant-pool entry and operands to load from it. + + // Medium and large mode can't fold loads this way. + if (TM.getCodeModel() != CodeModel::Small && + TM.getCodeModel() != CodeModel::Kernel) + return NULL; + + // x86-32 PIC requires a PIC base register for constant pools. 
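+    // In 64-bit mode RIP can serve as that base; in 32-bit PIC mode a
+    // materialized base register would be required, so that case bails out
+    // below instead of creating one here.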
+ unsigned PICBase = 0; + if (TM.getRelocationModel() == Reloc::PIC_) { + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + PICBase = X86::RIP; + else + // FIXME: PICBase = getGlobalBaseReg(&MF); + // This doesn't work for several reasons. + // 1. GlobalBaseReg may have been spilled. + // 2. It may not be live at MI. + return NULL; + } + + // Create a constant-pool entry. + MachineConstantPool &MCP = *MF.getConstantPool(); + Type *Ty; + unsigned Opc = LoadMI->getOpcode(); + if (Opc == X86::FsFLD0SS) + Ty = Type::getFloatTy(MF.getFunction()->getContext()); + else if (Opc == X86::FsFLD0SD) + Ty = Type::getDoubleTy(MF.getFunction()->getContext()); + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); + else + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); + + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); + const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : + Constant::getNullValue(Ty); + unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); + + // Create operands to load from the constant pool entry. + MOs.push_back(MachineOperand::CreateReg(PICBase, false)); + MOs.push_back(MachineOperand::CreateImm(1)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + break; + } + default: { + if ((LoadMI->getOpcode() == X86::MOVSSrm || + LoadMI->getOpcode() == X86::VMOVSSrm) && + MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() + > 4) + // These instructions only load 32 bits, we can't fold them if the + // destination register is wider than 32 bits (4 bytes). + return NULL; + if ((LoadMI->getOpcode() == X86::MOVSDrm || + LoadMI->getOpcode() == X86::VMOVSDrm) && + MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() + > 8) + // These instructions only load 64 bits, we can't fold them if the + // destination register is wider than 64 bits (8 bytes). + return NULL; + + // Folding a normal load. Just copy the load's address operands. + unsigned NumOps = LoadMI->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + MOs.push_back(LoadMI->getOperand(i)); + break; + } + } + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment); +} + + +bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + // Check switch flag + if (NoFusing) return 0; + + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + switch (MI->getOpcode()) { + default: return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + return true; + case X86::ADD32ri: + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return false; + break; + } + } + + if (Ops.size() != 1) + return false; + + unsigned OpNum = Ops[0]; + unsigned Opc = MI->getOpcode(); + unsigned NumOps = MI->getDesc().getNumOperands(); + bool isTwoAddr = NumOps > 1 && + MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. It requires + // replacing the *two* registers with the memory location. 
+ const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + if (isTwoAddr && NumOps >= 2 && OpNum < 2) { + OpcodeTablePtr = &RegOp2MemOpTable2Addr; + } else if (OpNum == 0) { // If operand 0 + switch (Opc) { + case X86::MOV8r0: + case X86::MOV16r0: + case X86::MOV32r0: + case X86::MOV64r0: return true; + default: break; + } + OpcodeTablePtr = &RegOp2MemOpTable0; + } else if (OpNum == 1) { + OpcodeTablePtr = &RegOp2MemOpTable1; + } else if (OpNum == 2) { + OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (OpNum == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; + } + + if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) + return true; + return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); +} + +bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(MI->getOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return false; + UnfoldLoad &= FoldedLoad; + if (UnfoldStore && !FoldedStore) + return false; + UnfoldStore &= FoldedStore; + + const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + if (!MI->hasOneMemOperand() && + RC == &X86::VR128RegClass && + !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + // Without memoperands, loadRegFromAddr and storeRegToStackSlot will + // conservatively assume the address is unaligned. That's bad for + // performance. + return false; + SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps; + SmallVector<MachineOperand,2> BeforeOps; + SmallVector<MachineOperand,2> AfterOps; + SmallVector<MachineOperand,4> ImpOps; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (i >= Index && i < Index + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (Op.isReg() && Op.isImplicit()) + ImpOps.push_back(Op); + else if (i < Index) + BeforeOps.push_back(Op); + else if (i > Index) + AfterOps.push_back(Op); + } + + // Emit the load instruction. + if (UnfoldLoad) { + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs); + if (UnfoldStore) { + // Address operands cannot be marked isKill. + for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { + MachineOperand &MO = NewMIs[0]->getOperand(i); + if (MO.isReg()) + MO.setIsKill(false); + } + } + } + + // Emit the data processing instruction. 
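+  // Rebuild the register form in the original operand order: the re-created
+  // def (if a store was folded), the operands before the memory reference,
+  // the loaded register, the trailing operands, then the implicit operands.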
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, DataMI); + + if (FoldedStore) + MIB.addReg(Reg, RegState::Define); + for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) + MIB.addOperand(BeforeOps[i]); + if (FoldedLoad) + MIB.addReg(Reg); + for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) + MIB.addOperand(AfterOps[i]); + for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { + MachineOperand &MO = ImpOps[i]; + MIB.addReg(MO.getReg(), + getDefRegState(MO.isDef()) | + RegState::Implicit | + getKillRegState(MO.isKill()) | + getDeadRegState(MO.isDead()) | + getUndefRegState(MO.isUndef())); + } + // Change CMP32ri r, 0 back to TEST32rr r, r, etc. + switch (DataMI->getOpcode()) { + default: break; + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP8ri: { + MachineOperand &MO0 = DataMI->getOperand(0); + MachineOperand &MO1 = DataMI->getOperand(1); + if (MO1.getImm() == 0) { + unsigned NewOpc; + switch (DataMI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::CMP64ri8: + case X86::CMP64ri32: NewOpc = X86::TEST64rr; break; + case X86::CMP32ri8: + case X86::CMP32ri: NewOpc = X86::TEST32rr; break; + case X86::CMP16ri8: + case X86::CMP16ri: NewOpc = X86::TEST16rr; break; + case X86::CMP8ri: NewOpc = X86::TEST8rr; break; + } + DataMI->setDesc(get(NewOpc)); + MO1.ChangeToRegister(MO0.getReg(), false); + } + } + } + NewMIs.push_back(DataMI); + + // Emit the store instruction. + if (UnfoldStore) { + const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs); + } + + return true; +} + +bool +X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + if (!N->isMachineOpcode()) + return false; + + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(N->getMachineOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + const MCInstrDesc &MCID = get(Opc); + MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + unsigned NumDefs = MCID.NumDefs; + std::vector<SDValue> AddrOps; + std::vector<SDValue> BeforeOps; + std::vector<SDValue> AfterOps; + DebugLoc dl = N->getDebugLoc(); + unsigned NumOps = N->getNumOperands(); + for (unsigned i = 0; i != NumOps-1; ++i) { + SDValue Op = N->getOperand(i); + if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (i < Index-NumDefs) + BeforeOps.push_back(Op); + else if (i > Index-NumDefs) + AfterOps.push_back(Op); + } + SDValue Chain = N->getOperand(NumOps-1); + AddrOps.push_back(Chain); + + // Emit the load instruction. 
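+  // A load node is only created if the folded form contained one; it reuses
+  // the address operands and chain gathered above, and its memory operands
+  // decide whether an aligned load opcode may be used.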
+ SDNode *Load = 0; + if (FoldedLoad) { + EVT VT = *RC->vt_begin(); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + // Do not introduce a slow unaligned load. + return false; + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, + VT, MVT::Other, AddrOps); + NewNodes.push_back(Load); + + // Preserve memory reference information. + cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + } + + // Emit the data processing instruction. + std::vector<EVT> VTs; + const TargetRegisterClass *DstRC = 0; + if (MCID.getNumDefs() > 0) { + DstRC = getRegClass(MCID, 0, &RI, MF); + VTs.push_back(*DstRC->vt_begin()); + } + for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + EVT VT = N->getValueType(i); + if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) + VTs.push_back(VT); + } + if (Load) + BeforeOps.push_back(SDValue(Load, 0)); + std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps)); + SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); + NewNodes.push_back(NewNode); + + // Emit the store instruction. + if (FoldedStore) { + AddrOps.pop_back(); + AddrOps.push_back(SDValue(NewNode, 0)); + AddrOps.push_back(Chain); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + // Do not introduce a slow unaligned store. + return false; + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, + isAligned, TM), + dl, MVT::Other, AddrOps); + NewNodes.push_back(Store); + + // Preserve memory reference information. 
+    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex) const {
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(Opc);
+  if (I == MemOp2RegOpTable.end())
+    return 0;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return 0;
+  if (UnfoldStore && !FoldedStore)
+    return 0;
+  if (LoadRegIndex)
+    *LoadRegIndex = I->second.second & TB_INDEX_MASK;
+  return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                      int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  switch (Opc1) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    break;
+  }
+  switch (Opc2) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    break;
+  }
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+
+    // Now let's examine the displacements.
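+    // Both displacements must be constant for the byte offsets to be
+    // comparable; symbolic displacements are not handled.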
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  if (Opc1 != Opc2)
+    return false;  // FIXME: overly conservative?
+
+  switch (Opc1) {
+  default: break;
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return false;
+  }
+
+  EVT VT = Load1->getValueType(0);
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+    // have 16 of them to play with.
+    if (TM.getSubtargetImpl()->is64Bit()) {
+      if (NumLoads >= 3)
+        return false;
+    } else if (NumLoads) {
+      return false;
+    }
+    break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    if (NumLoads)
+      return false;
+    break;
+  }
+
+  return true;
+}
+
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+  if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+    return true;
+  Cond[0].setImm(GetOppositeBranchCondition(CC));
+  return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // FIXME: Return false for x87 stack register classes for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+         "X86-64 PIC uses RIP relative addressing");
+
+  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+  unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Create the register. The code to initialize it is inserted
+  // later, by the CGBR pass (below).
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  X86FI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
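+// The column index is the execution domain (1 = PackedSingle,
+// 2 = PackedDouble, 3 = PackedInt), matching the domain numbering used by
+// getExecutionDomain and setExecutionDomain below.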
+static const uint16_t ReplaceableInstrs[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, + // AVX 128-bit support + { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, + { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, + { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, + { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, + { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, + { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, + { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, + { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, + { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, + { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, + { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, + { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, + { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, + // AVX 256-bit support + { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, + { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, + { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, + { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, + { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } +}; + +static const uint16_t ReplaceableInstrsAVX2[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, + { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, + { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, + { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr }, + { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, + { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, + { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, + { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr } +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. 
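+
+// The valid-domain masks used in getExecutionDomain below are bitmasks over
+// the domain numbers: 0xe permits all three packed domains, while 0x6
+// (without AVX2) excludes the 256-bit integer domain.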
+ +static const uint16_t *lookup(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) + if (ReplaceableInstrs[i][domain-1] == opcode) + return ReplaceableInstrs[i]; + return 0; +} + +static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) + if (ReplaceableInstrsAVX2[i][domain-1] == opcode) + return ReplaceableInstrsAVX2[i]; + return 0; +} + +std::pair<uint16_t, uint16_t> +X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2(); + uint16_t validDomains = 0; + if (domain && lookup(MI->getOpcode(), domain)) + validDomains = 0xe; + else if (domain && lookupAVX2(MI->getOpcode(), domain)) + validDomains = hasAVX2 ? 0xe : 0x6; + return std::make_pair(domain, validDomains); +} + +void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const uint16_t *table = lookup(MI->getOpcode(), dom); + if (!table) { // try the other table + assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) && + "256-bit vector operations only available in AVX2"); + table = lookupAVX2(MI->getOpcode(), dom); + } + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} + +/// getNoopForMachoTarget - Return the noop instruction to use for a noop. +void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(X86::NOOP); +} + +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { + default: return false; + case X86::DIVSDrm: + case X86::DIVSDrm_Int: + case X86::DIVSDrr: + case X86::DIVSDrr_Int: + case X86::DIVSSrm: + case X86::DIVSSrm_Int: + case X86::DIVSSrr: + case X86::DIVSSrr_Int: + case X86::SQRTPDm: + case X86::SQRTPDr: + case X86::SQRTPSm: + case X86::SQRTPSr: + case X86::SQRTSDm: + case X86::SQRTSDm_Int: + case X86::SQRTSDr: + case X86::SQRTSDr_Int: + case X86::SQRTSSm: + case X86::SQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + // AVX instructions with high latency + case X86::VDIVSDrm: + case X86::VDIVSDrm_Int: + case X86::VDIVSDrr: + case X86::VDIVSDrr_Int: + case X86::VDIVSSrm: + case X86::VDIVSSrm_Int: + case X86::VDIVSSrr: + case X86::VDIVSSrr_Int: + case X86::VSQRTPDm: + case X86::VSQRTPDr: + case X86::VSQRTPSm: + case X86::VSQRTPSr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSSr: + return true; + } +} + +bool X86InstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + +namespace { + /// CGBR - Create Global Base Reg pass. This initializes the PIC + /// global base register for x86-32. 
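+  /// The pass runs only for the PIC relocation model and materializes the
+  /// base register once, at the top of the function's entry block.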
+  struct CGBR : public MachineFunctionPass {
+    static char ID;
+    CGBR() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      const X86TargetMachine *TM =
+        static_cast<const X86TargetMachine *>(&MF.getTarget());
+
+      assert(!TM->getSubtarget<X86Subtarget>().is64Bit() &&
+             "X86-64 PIC uses RIP relative addressing");
+
+      // Only emit a global base reg in PIC mode.
+      if (TM->getRelocationModel() != Reloc::PIC_)
+        return false;
+
+      X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+      unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+      // If we didn't need a GlobalBaseReg, don't insert code.
+      if (GlobalBaseReg == 0)
+        return false;
+
+      // Insert the set of GlobalBaseReg into the first MBB of the function.
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      unsigned PC;
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+      else
+        PC = GlobalBaseReg;
+
+      // Operand of MovePCtoStack is completely ignored by asm printer. It's
+      // only used in JIT code emission as displacement to pc.
+      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+      // If we're using vanilla 'GOT' PIC style, we should use relative
+      // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+          .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                        X86II::MO_GOT_ABSOLUTE_ADDRESS);
+      }
+
+      return true;
+    }
+
+    virtual const char *getPassName() const {
+      return "X86 PIC Global Base Reg Initialization";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there aren't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+          case X86::TLS_base_addr32:
+          case X86::TLS_base_addr64:
+            if (TLSBaseAddrReg)
+              I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+            else
+              I = SetRegister(I, &TLSBaseAddrReg);
+            Changed = true;
+            break;
+          default:
+            break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
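+      // Passing TLSBaseAddrReg down lets a base address computed in a
+      // dominating block be reused by every block it dominates, so at most
+      // one TLS_base_addr call survives per dominator subtree.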
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   is64Bit ? X86::RAX : X86::EAX)
+                                   .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtual register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+                                                      ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
+
+      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+      MachineInstr *Next = I->getNextNode();
+      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   *TLSBaseAddrReg)
+                                   .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+      return Copy;
+    }
+
+    virtual const char *getPassName() const {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..260f054
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,419 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+  class X86RegisterInfo;
+  class X86TargetMachine;
+
+namespace X86 {
+  // X86 specific condition code. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in sync.
+  enum CondCode {
+    COND_A  = 0,
+    COND_AE = 1,
+    COND_B  = 2,
+    COND_BE = 3,
+    COND_E  = 4,
+    COND_G  = 5,
+    COND_GE = 6,
+    COND_L  = 7,
+    COND_LE = 8,
+    COND_NE = 9,
+    COND_NO = 10,
+    COND_NP = 11,
+    COND_NS = 12,
+    COND_O  = 13,
+    COND_P  = 14,
+    COND_S  = 15,
+
+    // Artificial condition codes. These are used by AnalyzeBranch
+    // to indicate a block terminated with two conditional branches to
+    // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+    // which can't be represented on x86 with a single condition. These
+    // are never used in MachineInstrs.
+    COND_NE_OR_P,
+    COND_NP_OR_E,
+
+    COND_INVALID
+  };
+
+  // Turn condition code into conditional branch opcode.
+  unsigned GetCondBranchFromCond(CondCode CC);
+
+  // Turn CMov opcode into condition code.
+  CondCode getCondFromCMovOpc(unsigned Opc);
+
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  CondCode GetOppositeBranchCondition(X86::CondCode CC);
+}  // end namespace X86
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_DLLIMPORT:                 // dllimport stub.
+  case X86II::MO_GOTPCREL:                  // rip-relative GOT reference.
+  case X86II::MO_GOT:                       // normal GOT reference.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:   // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_NONLAZY:            // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_GOTOFF:                    // isPICStyleGOT: local global.
+  case X86II::MO_GOT:                       // isPICStyleGOT: other global.
+  case X86II::MO_PIC_BASE_OFFSET:           // Darwin local global.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:   // Darwin/32 external global.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+  case X86II::MO_TLVP:                      // ??? Pretty sure..
+    return true;
+  default:
+    return false;
+  }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+  return MO.isImm() &&
+    (MO.getImm() == 1 || MO.getImm() == 2 ||
+     MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+4 <= MI->getNumOperands() &&
+    MI->getOperand(Op  ).isReg() && isScale(MI->getOperand(Op+1)) &&
+    MI->getOperand(Op+2).isReg() &&
+    (MI->getOperand(Op+3).isImm() ||
+     MI->getOperand(Op+3).isGlobal() ||
+     MI->getOperand(Op+3).isCPI() ||
+     MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+5 <= MI->getNumOperands() &&
+    MI->getOperand(Op+4).isReg() &&
+    isLeaMem(MI, Op);
+}
+
+class X86InstrInfo : public X86GenInstrInfo {
+  X86TargetMachine &TM;
+  const X86RegisterInfo RI;
+
+  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+  /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
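+  /// Each entry maps a register-form opcode to a (memory-form opcode, flags)
+  /// pair; the flags encode the folded operand index, the minimum alignment
+  /// required, and whether a load and/or store was folded.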
+ /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > RegOp2MemOpTableType; + RegOp2MemOpTableType RegOp2MemOpTable2Addr; + RegOp2MemOpTableType RegOp2MemOpTable0; + RegOp2MemOpTableType RegOp2MemOpTable1; + RegOp2MemOpTableType RegOp2MemOpTable2; + RegOp2MemOpTableType RegOp2MemOpTable3; + + /// MemOp2RegOpTable - Load / store unfolding opcode map. + /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > MemOp2RegOpTableType; + MemOp2RegOpTableType MemOp2RegOpTable; + + static void AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags); + +public: + explicit X86InstrInfo(X86TargetMachine &tm); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const X86RegisterInfo &getRegisterInfo() const { return RI; } + + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" + /// extension instruction. That is, it's like a copy where it's legal for the + /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns + /// true, then it's expected the pre-extension value is available as a subreg + /// of the result register. This also returns the sub-register index in + /// SubIdx. + virtual bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + + unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; + /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const; + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const; + + /// convertToThreeAddress - This method must be implemented by targets that + /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target + /// may be able to convert a two-address instruction into a true + /// three-address instruction on demand. This allows the X86 target (for + /// example) to convert ADD and SHL instructions into LEA instructions if they + /// would require register copies due to two-addressness. + /// + /// This method returns a null pointer if the transformation cannot be + /// performed, otherwise it returns the new instruction. + /// + virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + /// commuteInstruction - We have a few instructions that must be hacked on to + /// commute them. + /// + virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const; + + // Branch analysis. 
+  virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  virtual bool canInsertSelect(const MachineBasicBlock&,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               unsigned, unsigned, int&, int&, int&) const;
+  virtual void insertSelect(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, DebugLoc DL,
+                            unsigned DstReg,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            unsigned TrueReg, unsigned FalseReg) const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                              SmallVectorImpl<MachineOperand> &Addr,
+                              const TargetRegisterClass *RC,
+                              MachineInstr::mmo_iterator MMOBegin,
+                              MachineInstr::mmo_iterator MMOEnd,
+                              SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                               SmallVectorImpl<MachineOperand> &Addr,
+                               const TargetRegisterClass *RC,
+                               MachineInstr::mmo_iterator MMOBegin,
+                               MachineInstr::mmo_iterator MMOEnd,
+                               SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  virtual
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                         int FrameIx, uint64_t Offset,
+                                         const MDNode *MDPtr,
+                                         DebugLoc DL) const;
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand(s). If folding is possible, this method returns the
+  /// new instruction, otherwise it returns NULL. If it folds the instruction,
+  /// it is likely that the MachineInstr the iterator references has been
+  /// changed.
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  /// foldMemoryOperand - Same as the previous version except it allows folding
+  /// of any load and store from / to any address, not just from a specific
+  /// stack slot.
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const;
+
+  /// canFoldMemoryOperand - Returns true if folding the specified load /
+  /// store is possible.
+  virtual bool canFoldMemoryOperand(const MachineInstr*,
+                                    const SmallVectorImpl<unsigned> &) const;
+
+  /// unfoldMemoryOperand - Separate a single instruction which folded a load
+  /// or a store or a load and a store into two or more instructions. If this
+  /// is possible, returns true as well as the new instructions by reference.
+  virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                   unsigned Reg, bool UnfoldLoad,
+                                   bool UnfoldStore,
+                                   SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                   SmallVectorImpl<SDNode*> &NewNodes) const;
+
+  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+  /// instruction after load / store are unfolded from an instruction of the
+  /// specified opcode. It returns zero if the specified unfolding is not
+  /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+  /// index of the operand which will hold the register holding the loaded
+  /// value.
+  virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex = 0) const;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+  /// to determine if two loads are loading from the same base address. It
+  /// should only return true if the base pointers are the same and the
+  /// only difference between the two addresses is the offset. It also returns
+  /// the offsets by reference.
+  virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                       int64_t &Offset1,
+                                       int64_t &Offset2) const;
+
+  /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled together. On some targets if two loads are loading
+  /// from addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load
+  /// offsets from the common base address. It returns true if it decides it's
+  /// desirable to schedule the two loads together. "NumLoads" is the number
+  /// of loads that have already been scheduled after Load1.
+  virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                       int64_t Offset1, int64_t Offset2,
+                                       unsigned NumLoads) const;
+
+  virtual void getNoopForMachoTarget(MCInst &NopInst) const;
+
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+  /// instruction that defines the specified register class.
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+  static bool isX86_64ExtendedReg(const MachineOperand &MO) {
+    if (!MO.isReg()) return false;
+    return X86II::isX86_64ExtendedReg(MO.getReg());
+  }
+
+  /// getGlobalBaseReg - Return a virtual register initialized with
+  /// the global base register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr *MI) const;
+
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+
+ unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const;
+ void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const;
+
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ unsigned OpNum,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Alignment) const;
+
+ bool isHighLatencyDef(int opc) const;
+
+ bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx) const;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const;
+
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB. We only look at one load and see
+ /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+ /// defined by the load we are trying to fold. DefMI returns the machine
+ /// instruction that defines FoldAsLoadDefReg, and the function returns
+ /// the machine instruction generated due to folding.
+ virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const;
+
+private:
+ MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and following operands form a reference to the stack frame.
+ bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..3380d8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2197 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 specific DAG Nodes. +// + +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; + +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; + +def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def SDTX86Cmov : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +// Unary and binary operator instructions that set EFLAGS as a side-effect. +def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86BrCond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86SetCC : SDTypeProfile<1, 2, + [SDTCisVT<0, i8>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86SetCC_C : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, + SDTCisVT<2, i8>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>; +def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>; + +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + +def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; + +def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; + +def SDTX86Void : SDTypeProfile<0, 0, []>; + +def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; + +def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; + +def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain,SDNPSideEffect]>; +def X86MFence : 
SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; +def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; + +def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; +def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; + +def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, + [SDNPHasChain]>; +def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; +def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; + +def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; + +def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; +def X86vaarg64 : + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; +def X86callseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def X86callseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; + +def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; +def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + +def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + +def X86Wrapper : 
SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; +def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; + +def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, + [SDNPHasChain]>; + +def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; +def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, + [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; + +def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; +def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; +def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>; + +def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>; +def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>; +def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>; + +def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; + +def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + +def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, + [SDNPHasChain]>; + +def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL, + [SDNPHasChain, SDNPOutGlue]>; + +//===----------------------------------------------------------------------===// +// X86 Operand Definitions. +// + +// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for +// the index operand of an address, to conform to x86 encoding restrictions. +def ptr_rc_nosp : PointerLikeRegClass<1>; + +// *mem - Operand definitions for the funky X86 addressing mode operands. 
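+// Each such operand ultimately expands to the five-operand tuple
+// (base, scale, index, disp, segment); e.g. the AT&T memory operand
+// "8(%rax,%rcx,4)" has base=RAX, scale=4, index=RCX, disp=8 and no
+// segment override.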
+// +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; let PredicateMethod = "isMem"; +} +def X86Mem8AsmOperand : AsmOperandClass { + let Name = "Mem8"; let PredicateMethod = "isMem8"; +} +def X86Mem16AsmOperand : AsmOperandClass { + let Name = "Mem16"; let PredicateMethod = "isMem16"; +} +def X86Mem32AsmOperand : AsmOperandClass { + let Name = "Mem32"; let PredicateMethod = "isMem32"; +} +def X86Mem64AsmOperand : AsmOperandClass { + let Name = "Mem64"; let PredicateMethod = "isMem64"; +} +def X86Mem80AsmOperand : AsmOperandClass { + let Name = "Mem80"; let PredicateMethod = "isMem80"; +} +def X86Mem128AsmOperand : AsmOperandClass { + let Name = "Mem128"; let PredicateMethod = "isMem128"; +} +def X86Mem256AsmOperand : AsmOperandClass { + let Name = "Mem256"; let PredicateMethod = "isMem256"; +} + +// Gather mem operands +def X86MemVX32Operand : AsmOperandClass { + let Name = "MemVX32"; let PredicateMethod = "isMemVX32"; +} +def X86MemVY32Operand : AsmOperandClass { + let Name = "MemVY32"; let PredicateMethod = "isMemVY32"; +} +def X86MemVX64Operand : AsmOperandClass { + let Name = "MemVX64"; let PredicateMethod = "isMemVX64"; +} +def X86MemVY64Operand : AsmOperandClass { + let Name = "MemVY64"; let PredicateMethod = "isMemVY64"; +} + +def X86AbsMemAsmOperand : AsmOperandClass { + let Name = "AbsMem"; + let SuperClasses = [X86MemAsmOperand]; +} +class X86MemOperand<string printMethod> : Operand<iPTR> { + let PrintMethod = printMethod; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + +let OperandType = "OPERAND_MEMORY" in { +def opaque32mem : X86MemOperand<"printopaquemem">; +def opaque48mem : X86MemOperand<"printopaquemem">; +def opaque80mem : X86MemOperand<"printopaquemem">; +def opaque512mem : X86MemOperand<"printopaquemem">; + +def i8mem : X86MemOperand<"printi8mem"> { + let ParserMatchClass = X86Mem8AsmOperand; } +def i16mem : X86MemOperand<"printi16mem"> { + let ParserMatchClass = X86Mem16AsmOperand; } +def i32mem : X86MemOperand<"printi32mem"> { + let ParserMatchClass = X86Mem32AsmOperand; } +def i64mem : X86MemOperand<"printi64mem"> { + let ParserMatchClass = X86Mem64AsmOperand; } +def i128mem : X86MemOperand<"printi128mem"> { + let ParserMatchClass = X86Mem128AsmOperand; } +def i256mem : X86MemOperand<"printi256mem"> { + let ParserMatchClass = X86Mem256AsmOperand; } +def f32mem : X86MemOperand<"printf32mem"> { + let ParserMatchClass = X86Mem32AsmOperand; } +def f64mem : X86MemOperand<"printf64mem"> { + let ParserMatchClass = X86Mem64AsmOperand; } +def f80mem : X86MemOperand<"printf80mem"> { + let ParserMatchClass = X86Mem80AsmOperand; } +def f128mem : X86MemOperand<"printf128mem"> { + let ParserMatchClass = X86Mem128AsmOperand; } +def f256mem : X86MemOperand<"printf256mem">{ + let ParserMatchClass = X86Mem256AsmOperand; } + +// Gather mem operands +def vx32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); + let ParserMatchClass = X86MemVX32Operand; } +def vy32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); + let ParserMatchClass = X86MemVY32Operand; } +def vx64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); + let ParserMatchClass = X86MemVX64Operand; } +def vy64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); + let ParserMatchClass = X86MemVY64Operand; } +} + +// A version of i8mem for use on 
x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86Mem8AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<2>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i32mem_TC : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, i8imm);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i64mem_TC : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+ ptr_rc_tailcall, i32imm, i8imm);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+ ParserMatchClass = X86AbsMemAsmOperand,
+ PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+def offset8 : Operand<i64>;
+def offset16 : Operand<i64>;
+def offset32 : Operand<i64>;
+def offset64 : Operand<i64>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSECC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AVXCC : Operand<i8> {
+ let PrintMethod = "printAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+class ImmZExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFFFF" (-1 in 16-bits).
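+//
+// For example, for a 16-bit operation with an 8-bit sign-extended immediate,
+// both "$-1" and "$0xFFFF" should match: each denotes the 16-bit value
+// 0xFFFF, which encodes as the single sign-extended byte 0xFF.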
+ +// [0, 0x7FFFFFFF] | +// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i32"; +} + +// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti16i8"; + let SuperClasses = [ImmSExti64i32AsmOperand]; +} + +// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti32i8"; +} + +// [0, 0x000000FF] +def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass { + let Name = "ImmZExtu32u8"; +} + + +// [0, 0x0000007F] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i8"; + let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, + ImmSExti64i32AsmOperand]; +} + +// A couple of more descriptive operand definitions. +// 16-bits but only 8 bits are significant. +def i16i8imm : Operand<i16> { + let ParserMatchClass = ImmSExti16i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant. +def i32i8imm : Operand<i32> { + let ParserMatchClass = ImmSExti32i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant, and those 8 bits are unsigned. +def u32u8imm : Operand<i32> { + let ParserMatchClass = ImmZExtu32u8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i32AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_pcrel : Operand<i64> { + let PrintMethod = "printPCRelImm"; + let ParserMatchClass = X86AbsMemAsmOperand; + let OperandType = "OPERAND_PCREL"; +} + +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printi32mem"; + let AsmOperandLowerMethod = "lower_lea64_32mem"; + let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + +// Memory operands that use 64-bit pointers in both ILP32 and LP64. +def lea64mem : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + + +//===----------------------------------------------------------------------===// +// X86 Complex Pattern Definitions. +// + +// Define X86 specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex], + []>; +// In 64-bit mode 32-bit LEAs can use RIP-relative addressing. 
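+// For example, "leal sym(%rip), %eax" computes a 32-bit result from a 64-bit
+// RIP-relative address; that is why X86WrapperRIP appears among the roots
+// matched by lea64_32addr below.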
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, + frameindex, X86WrapperRIP], + []>; + +def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], []>; + +def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + +//===----------------------------------------------------------------------===// +// X86 Instruction Predicate Definitions. +def HasCMov : Predicate<"Subtarget->hasCMov()">; +def NoCMov : Predicate<"!Subtarget->hasCMov()">; + +def HasMMX : Predicate<"Subtarget->hasMMX()">; +def Has3DNow : Predicate<"Subtarget->has3DNow()">; +def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; +def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; + +def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; +def HasAES : Predicate<"Subtarget->hasAES()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; +def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; +def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; +def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; +def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; +def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasBMI : Predicate<"Subtarget->hasBMI()">; +def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasHLE : Predicate<"Subtarget->hasHLE()">; +def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; +def HasADX : Predicate<"Subtarget->hasADX()">; +def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; +def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">; +def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; +def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def In32BitMode : Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<"!Mode64Bit", "32-bit mode">; +def In64BitMode : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def IsNaCl : 
Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+ "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def OptForSize : Predicate<"OptForSize">;
+def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
+ def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
+ def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
+ def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+}
+
+def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+ return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
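+// The fragments below therefore accept an extending load only when its
+// alignment guarantees that the widened access cannot cross into an
+// unmapped page, and never when the load is volatile.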
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 4 && !LD->isVolatile(); + return false; +}]>; + +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; + +def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; +def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; +def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; +def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; +def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; +def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; + +def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; +def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; +def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; +def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; +def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; +def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; +def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; +def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; +def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; +def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; + +def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; +def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; +def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; +def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; +def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; +def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; +def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + + +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'srl' node with a single use. 
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+ def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
+ def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
+ "nop{l}\t$zero", [], IIC_NOP>, TB;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+ "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In32BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG16>, OpSize;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG>, OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+ IIC_POP_MEM>;
+
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+ Requires<[In32BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>,
+ OpSize;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>;
+
+def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize;
+def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
+
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+ OpSize;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
+ Requires<[In32BitMode]>;
+
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW =
[WriteLoad] in { +def POP64r : I<0x58, AddRegFrm, + (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>; +def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [], + IIC_POP_MEM>; +} // mayLoad, SchedRW +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH64r : I<0x50, AddRegFrm, + (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>; +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>; +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], + IIC_PUSH_MEM>; +} // mayStore, SchedRW +} + +let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1, + SchedRW = [WriteStore] in { +def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>; +def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>; +def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>; +} + +let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, + Requires<[In64BitMode]>, Sched<[WriteLoad]>; +let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, + Requires<[In64BitMode]>, Sched<[WriteStore]>; + +let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], + mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>, + Requires<[In32BitMode]>; +} +let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], + mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>, + Requires<[In32BitMode]>; +} + +let Constraints = "$src = $dst", SchedRW = [WriteALU] in { +// GR32 = bswap GR32 +def BSWAP32r : I<0xC8, AddRegFrm, + (outs GR32:$dst), (ins GR32:$src), + "bswap{l}\t$dst", + [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB; + +def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "bswap{q}\t$dst", + [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; +} // Constraints = "$src = $dst", SchedRW + +// Bit scan instructions. 
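+// BSF and BSR return the index of the least / most significant set bit of the
+// source and set ZF when the source is zero (leaving the destination
+// undefined), which is why X86bsf and X86bsr also produce EFLAGS.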
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+ IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+ IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+ Sched<[WriteShift]>;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+ IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+ IIC_BSF>, TB, Sched<[WriteShift]>;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+ IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+ TB, OpSize, Sched<[WriteShift]>;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+ IIC_BSR>, TB,
+ OpSize, Sched<[WriteShiftLd]>;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+ Sched<[WriteShift]>;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+ IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+ Sched<[WriteShift]>;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+ IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in {
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>;
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>;
+
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)], IIC_MOV>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)], IIC_MOV>;
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)], IIC_MOV>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
+/// 32-bit absolute offset from the segment base (not a PC-relative
+/// offset). These are only valid in x86-32 mode.
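+/// For example, "movb 0x12345678, %al" assembles to opcode A0 followed by the
+/// raw 32-bit address; the encoding has no ModRM byte, which is why only
+/// these accumulator forms exist.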
+let SchedRW = [WriteALU] in {
+def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
+ "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
+ "mov{w}\t{$src, %ax|AX, $src}", [], IIC_MOV_MEM>, OpSize,
+ Requires<[In32BitMode]>;
+def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
+ "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, AX}", [], IIC_MOV_MEM>, OpSize,
+ Requires<[In32BitMode]>;
+def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+}
+
+// FIXME: These definitions are utterly broken
+// Just leave them commented out for now because they're useless outside
+// of the large code model, and most compilers won't generate the instructions
+// in question.
+/*
+def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
+ "mov{q}\t{$src, %rax|RAX, $src}", []>;
+def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "mov{q}\t{$src, %rax|RAX, $src}", []>;
+def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
+def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
+*/
+
+
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let isCodeGenOnly = 1 in {
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ Sched<[WriteMove]>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, neverHasSideEffects = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteLoad]>;
+}
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
+ IIC_AHF>; // AH = flags
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit test instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
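+// That is, "bt %eax, (%rdi)" may access memory well beyond (%rdi) when the
+// bit index in %eax is large, instead of masking the index the way the
+// register+register form does.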
+ +let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { + def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi16 addr:$src1), GR16:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, OpSize, TB, Requires<[FastBTMem]>; + def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi32 addr:$src1), GR32:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, TB, Requires<[FastBTMem]>; + def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi64 addr:$src1), GR64:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, TB; +} + +let SchedRW = [WriteALU] in { +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], + IIC_BT_RI>, OpSize, TB; +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))], + IIC_BT_RI>, TB; +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], + IIC_BT_RI>, TB; +} // SchedRW + +// Note that these instructions don't need FastBTMem because that +// only applies when the other operand is in a register. When it's +// an immediate, bt is still fast. +let SchedRW = [WriteALU] in { +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) + ], IIC_BT_MI>, OpSize, TB; +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) + ], IIC_BT_MI>, TB; +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi64 addr:$src1), + i64immSExt8:$src2))], IIC_BT_MI>, TB; +} // SchedRW + +let hasSideEffects = 0 in { +let SchedRW = [WriteALU] in { +def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; +def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; +def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +def BTC64ri8 : RIi8<0xBA, 
MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} + +let SchedRW = [WriteALU] in { +def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; +def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; +def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} + +let SchedRW = [WriteALU] in { +def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; +def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; +def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTS16ri8 
: Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} +} // hasSideEffects = 0 +} // Defs = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Atomic support +// + +// Atomic swap. These are just normal xchg instructions. But since a memory +// operand is referenced, the atomicity is ensured. +multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, + InstrItinClass itin> { + let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set + GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin>; + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize; + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>; + def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>; + +// Swap between registers. +let SchedRW = [WriteALU] in { +let Constraints = "$val = $dst" in { +def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), + "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), + "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize; +def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), + "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), + "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +} + +// Swap between EAX and other registers. +def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), + "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; +def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), + "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + Requires<[In32BitMode]>; +// Uses GR32_NOAX in 64-bit mode so this can never be encoded as the one-byte +// 0x90 NOP. In 64-bit mode, xchg %eax, %eax must clear the upper 32 bits of +// RAX, so it is not a NOP.
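+// For example, if RAX = 0x1122334455667788, the two-byte encoding 87 C0 of
+// "xchg %eax, %eax" leaves RAX = 0x0000000055667788, while the one-byte 0x90
+// leaves RAX untouched.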
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), + "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + Requires<[In64BitMode]>; +def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), + "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>; +} // SchedRW + +let SchedRW = [WriteALU] in { +def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize; +def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; +def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize; +def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; +def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; + +} + +let SchedRW = [WriteALU] in { +def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG8>, TB; +def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize; +def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; +def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; +} // SchedRW + +let SchedRW = [WriteALULd, WriteRMW] in { +let mayLoad = 1, mayStore = 1 in { +def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM8>, TB; +def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize; +def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; +def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in +def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), + "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), + "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>, + TB, Requires<[HasCmpxchg16b]>; +} // SchedRW + + +// Lock instruction prefix +def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; + +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; + +// Repeat string operation instruction prefixes +// These use the DF flag in the
EFLAGS register to control the direction of the string operation, and decrement ECX on each iteration +let Defs = [ECX], Uses = [ECX,EFLAGS] in { +// Repeat (used with INS, OUTS, MOVS, LODS and STOS) +def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +// Repeat while not equal (used with CMPS and SCAS) +def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +} + + +// String manipulation instructions +let SchedRW = [WriteMicrocoded] in { +def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>; +def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize; +def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>; +def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>; +} + +let SchedRW = [WriteSystem] in { +def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>; +def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize; +def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>; +} + +// Flag instructions +let SchedRW = [WriteALU] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>; +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>; +def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; + +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; +} + +// Table lookup instructions +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, + Sched<[WriteLoad]>; + +let SchedRW = [WriteMicrocoded] in { +// ASCII Adjust After Addition +// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, + Requires<[In32BitMode]>; + +// ASCII Adjust AX Before Division +// sets AL, AH and EFLAGS and uses AL and AH +def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), + "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>; + +// ASCII Adjust AX After Multiply +// sets AL, AH and EFLAGS and uses AL +def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), + "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>; + +// ASCII Adjust AL After Subtraction +// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>, + Requires<[In32BitMode]>; + +// Decimal Adjust AL after Addition +// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, + Requires<[In32BitMode]>; + +// Decimal Adjust AL after Subtraction +// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, + Requires<[In32BitMode]>; +} // SchedRW + +let SchedRW = [WriteSystem] in { +// Check Array Index Against Bounds +def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize, + Requires<[In32BitMode]>; +def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, + Requires<[In32BitMode]>; + +// Adjust RPL Field of Segment Selector +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, + Requires<[In32BitMode]>; +def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", [],
IIC_ARPL_MEM>, + Requires<[In32BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVBE Instructions +// +let Predicates = [HasMOVBE] in { + let SchedRW = [WriteALULd] in { + def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, + OpSize, T8; + def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>, + T8; + def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, + T8; + } + let SchedRW = [WriteStore] in { + def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, + OpSize, T8; + def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>, + T8; + def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, + T8; + } +} + +//===----------------------------------------------------------------------===// +// RDRAND Instruction +// +let Predicates = [HasRDRAND], Defs = [EFLAGS] in { + def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), + "rdrand{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB; + def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), + "rdrand{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB; + def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), + "rdrand{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB; +} + +//===----------------------------------------------------------------------===// +// RDSEED Instruction +// +let Predicates = [HasRDSEED], Defs = [EFLAGS] in { + def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), + "rdseed{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize, TB; + def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), + "rdseed{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, TB; + def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), + "rdseed{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB; +} + +//===----------------------------------------------------------------------===// +// LZCNT Instruction +// +let Predicates = [HasLZCNT], Defs = [EFLAGS] in { + def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize; + def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize; + + def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS; + def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS; + + def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, + 
XS; + def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +//===----------------------------------------------------------------------===// +// BMI Instructions +// +let Predicates = [HasBMI], Defs = [EFLAGS] in { + def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize; + def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize; + + def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS; + def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS; + + def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, + XS; + def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, + RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, + PatFrag ld_frag> { + def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode RC:$src)), (implicit EFLAGS)]>, T8, VEX_4V; + def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>, + T8, VEX_4V; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, + X86blsr, loadi32>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, + X86blsr, loadi64>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, + X86blsmsk, loadi32>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, + X86blsmsk, loadi64>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, + X86blsi, loadi32>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, + X86blsi, loadi64>, VEX_W; +} + +multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8, VEX_4VOp3; + def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8, VEX_4VOp3; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, + int_x86_bmi_bextr_32, loadi32>; + defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, + int_x86_bmi_bextr_64, loadi64>, VEX_W; +} + +let Predicates = [HasBMI2], Defs = [EFLAGS] in { + defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", 
GR32, i32mem, + int_x86_bmi_bzhi_32, loadi32>; + defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, + int_x86_bmi_bzhi_64, loadi64>, VEX_W; +} + +multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + VEX_4V; + def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V; +} + +let Predicates = [HasBMI2] in { + defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, + int_x86_bmi_pdep_32, loadi32>, T8XD; + defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, + int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, + int_x86_bmi_pext_32, loadi32>, T8XS; + defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, + int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; +} + +//===----------------------------------------------------------------------===// +// Subsystems. +//===----------------------------------------------------------------------===// + +include "X86InstrArithmetic.td" +include "X86InstrCMovSetCC.td" +include "X86InstrExtension.td" +include "X86InstrControl.td" +include "X86InstrShiftRotate.td" + +// X87 Floating Point Stack. +include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +include "X86InstrFMA.td" + +// XOP +include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +include "X86InstrSSE.td" +include "X86InstrMMX.td" +include "X86Instr3DNow.td" + +include "X86InstrVMX.td" +include "X86InstrSVM.td" + +include "X86InstrTSX.td" + +// System instructions. +include "X86InstrSystem.td" + +// Compiler Pseudo Instructions and Pat Patterns +include "X86InstrCompiler.td" + +//===----------------------------------------------------------------------===// +// Assembler Mnemonic Aliases +//===----------------------------------------------------------------------===// + +def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"cbw", "cbtw", "att">; +def : MnemonicAlias<"cwde", "cwtl", "att">; +def : MnemonicAlias<"cwd", "cwtd", "att">; +def : MnemonicAlias<"cdq", "cltd", "att">; +def : MnemonicAlias<"cdqe", "cltq", "att">; +def : MnemonicAlias<"cqo", "cqto", "att">; + +// lret maps to lretl, it is not ambiguous with lretq. +def : MnemonicAlias<"lret", "lretl", "att">; + +def : MnemonicAlias<"leavel", "leave", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope", "att">; +def : MnemonicAlias<"loopnz", "loopne", "att">; + +def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl", "att">; + +// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in +// all modes. However: "push (addr)" and "push $42" should default to +// pushl/pushq depending on the current mode. 
Similar for "pop %bx" +def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl", "att">; + +def : MnemonicAlias<"repe", "rep", "att">; +def : MnemonicAlias<"repz", "rep", "att">; +def : MnemonicAlias<"repnz", "repne", "att">; + +def : MnemonicAlias<"retl", "ret", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"retq", "ret", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"salb", "shlb", "att">; +def : MnemonicAlias<"salw", "shlw", "att">; +def : MnemonicAlias<"sall", "shll", "att">; +def : MnemonicAlias<"salq", "shlq", "att">; + +def : MnemonicAlias<"smovb", "movsb", "att">; +def : MnemonicAlias<"smovw", "movsw", "att">; +def : MnemonicAlias<"smovl", "movsl", "att">; +def : MnemonicAlias<"smovq", "movsq", "att">; + +def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"verrw", "verr", "att">; + +// System instruction aliases. +def : MnemonicAlias<"iret", "iretl", "att">; +def : MnemonicAlias<"sysret", "sysretl", "att">; +def : MnemonicAlias<"sysexit", "sysexitl", "att">; + +def : MnemonicAlias<"lgdtl", "lgdt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdtq", "lgdt", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidtl", "lidt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidtq", "lidt", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdtl", "sgdt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdtq", "sgdt", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidtl", "sidt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidtq", "sidt", "att">, Requires<[In64BitMode]>; + + +// Floating point stack aliases. +def : MnemonicAlias<"fcmovz", "fcmove", "att">; +def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; +def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; +def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; +def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; +def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fildq", "fildll", "att">; +def : MnemonicAlias<"fistpq", "fistpll", "att">; +def : MnemonicAlias<"fisttpq", "fisttpll", "att">; +def : MnemonicAlias<"fldcww", "fldcw", "att">; +def : MnemonicAlias<"fnstcww", "fnstcw", "att">; +def : MnemonicAlias<"fnstsww", "fnstsw", "att">; +def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fwait", "wait", "att">; + + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond> + : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), + !strconcat(Prefix, NewCond, Suffix)>; + +/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of +/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for +/// example "setz" -> "sete". 
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg +} + +// Aliases for set<CC> +defm : IntegerCondCodeMnemonicAlias<"set", "">; +// Aliases for j<CC> +defm : IntegerCondCodeMnemonicAlias<"j", "">; +// Aliases for cmov<CC>{w,l,q} +defm : IntegerCondCodeMnemonicAlias<"cmov", "w">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q">; + + +//===----------------------------------------------------------------------===// +// Assembler Instruction Aliases +//===----------------------------------------------------------------------===// + +// aad/aam default to base 10 if no operand is specified. +def : InstAlias<"aad", (AAD8i8 10)>; +def : InstAlias<"aam", (AAM8i8 10)>; + +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>; + +// clr aliases. +def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>; +def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>; +def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>; +def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>; + +// div and idiv aliases for explicit A register. +def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>; +def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>; +def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>; +def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>; +def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>; +def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>; +def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>; +def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>; +def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>; +def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>; +def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>; +def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>; +def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>; +def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>; +def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>; +def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; + + + +// Various unary fpstack operations default to operating on ST1.
+// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; +def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; +def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>; +def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>; +def : InstAlias<"fxch", (XCH_F ST1)>; +def : InstAlias<"fcom", (COM_FST0r ST1)>; +def : InstAlias<"fcomp", (COMP_FST0r ST1)>; +def : InstAlias<"fcomi", (COM_FIr ST1)>; +def : InstAlias<"fcompi", (COM_FIPr ST1)>; +def : InstAlias<"fucom", (UCOM_Fr ST1)>; +def : InstAlias<"fucomp", (UCOM_FPr ST1)>; +def : InstAlias<"fucomi", (UCOM_FIr ST1)>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; + +// We accept "fnstsw %eax" even though it only writes %ax. +def : InstAlias<"fnstsw %eax", (FNSTSW16r)>; +def : InstAlias<"fnstsw %al" , (FNSTSW16r)>; +def : InstAlias<"fnstsw" , (FNSTSW16r)>; + +// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but +// this is compatible with what GAS does. +def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>; +def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>; + +// "imul <imm>, B" is an alias for "imul <imm>, B, B". 
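+// For example, "imull $4, %ecx" is matched as "imull $4, %ecx, %ecx".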
+def : InstAlias<"imulw $imm, $r", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm)>; +def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>; +def : InstAlias<"imull $imm, $r", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm)>; +def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>; +def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; +def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; + +// inb %dx -> inb %al, %dx +def : InstAlias<"inb %dx", (IN8rr)>; +def : InstAlias<"inw %dx", (IN16rr)>; +def : InstAlias<"inl %dx", (IN32rr)>; +def : InstAlias<"inb $port", (IN8ri i8imm:$port)>; +def : InstAlias<"inw $port", (IN16ri i8imm:$port)>; +def : InstAlias<"inl $port", (IN32ri i8imm:$port)>; + + +// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp +def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; + +// Force mov without a suffix with a segment and mem to prefer the 'l' form of +// the move. All segment/mem forms are equivalent, this has the shortest +// encoding. +def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>; +def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; + +// Match 'movq <largeimm>, <reg>' as an alias for movabsq. +def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; + +// Match 'movq GR64, MMX' as an alias for movd. +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; + +// movsd with no operands (as opposed to the SSE scalar move of a double) is an +// alias for movsl. (as in rep; movsd) +def : InstAlias<"movsd", (MOVSD)>; + +// movsx aliases +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; + +// movzx aliases +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +// Note: No GR32->GR64 movzx form. 
+ +// outb %dx -> outb %al, %dx +def : InstAlias<"outb %dx", (OUT8rr)>; +def : InstAlias<"outw %dx", (OUT16rr)>; +def : InstAlias<"outl %dx", (OUT32rr)>; +def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>; +def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>; +def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; + +// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same +// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity +// errors, since its encoding is the most compact. +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; + +// shld/shrd op,op -> shld op, op, CL +def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>; +def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>; +def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>; +def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>; +def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>; +def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>; + +def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; +def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; +def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; +def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; +def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; + +/* FIXME: This is disabled because the asm matcher is currently incapable of + * matching a fixed immediate like $1. +// "shl X, $1" is an alias for "shl X". +multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> { + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>; + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>; +} + +defm : ShiftRotateByOneAlias<"rcl", "RCL">; +defm : ShiftRotateByOneAlias<"rcr", "RCR">; +defm : ShiftRotateByOneAlias<"rol", "ROL">; +defm : ShiftRotateByOneAlias<"ror", "ROR">; +FIXME */ + +// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. +def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>; + +// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. 
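+// For example, both "xchgl %ebx, (%rcx)" and "xchgl (%rcx), %ebx" are
+// matched to XCHG32rm.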
+def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>; + +// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. +def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>; +def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; +def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; +def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td new file mode 100644 index 0000000..49721df --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -0,0 +1,624 @@ +//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MMX instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +// All instructions that use MMX should be in this file, even if they also use +// SSE. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Multiclasses +//===----------------------------------------------------------------------===// + +let Sched = WriteVecALU in { +def MMX_INTALU_ITINS : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +def MMX_INTALUQ_ITINS : OpndItins< + IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM +>; + +def MMX_PHADDSUBW : OpndItins< + IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM +>; + +def MMX_PHADDSUBD : OpndItins< + IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM +>; +} + +let Sched = WriteVecIMul in +def MMX_PMUL_ITINS : OpndItins< + IIC_MMX_PMUL, IIC_MMX_PMUL +>; + +let Sched = WriteVecALU in { +def MMX_PSADBW_ITINS : OpndItins< + IIC_MMX_PSADBW, IIC_MMX_PSADBW +>; + +def MMX_MISC_FUNC_ITINS : OpndItins< + IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG +>; +} + +def MMX_SHIFT_ITINS : ShiftOpndItins< + IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI +>; + +let Sched = WriteShuffle in { +def MMX_UNPCK_H_ITINS : OpndItins< + IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM +>; + +def MMX_UNPCK_L_ITINS : OpndItins< + IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L +>; + +def MMX_PCK_ITINS : OpndItins< + IIC_MMX_PCK_RR, IIC_MMX_PCK_RM +>; + +def MMX_PSHUF_ITINS : OpndItins< + IIC_MMX_PSHUF, IIC_MMX_PSHUF +>; +} // Sched + +let Sched = WriteCvtF2I in { +def MMX_CVT_PD_ITINS : OpndItins< + IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM +>; + +def MMX_CVT_PS_ITINS : OpndItins< + IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM +>; +} + +let Constraints = "$src1 = $dst" in { + // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. + // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + OpndItins itins, bit Commutable = 0> { + def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]> { + let isCommutable = Commutable; + } + def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + + multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2, ShiftOpndItins itins> { + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[WriteVecShift]>; + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))], + itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), + (ins VR64:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>, + Sched<[WriteVecShift]>; + } +} + +/// Unary MMX instructions requiring SSSE3. +multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, OpndItins itins> { + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>, + Sched<[itins.Sched]>; + + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, + (IntId64 (bitconvert (memopmmx addr:$src))))], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +/// Binary MMX instructions requiring SSSE3. +let ImmT = NoImm, Constraints = "$src1 = $dst" in { +multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, OpndItins itins> { + let isCommutable = 0 in + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (memopmmx addr:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} + +/// PALIGN MMX instructions (require SSSE3). 
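+/// palignr concatenates the two 64-bit sources into a 128-bit value and
+/// extracts the 64 bits that start at the byte offset given by the immediate.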
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { + def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>; + def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>; +} + +multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm, OpndItins itins, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>, + Sched<[itins.Sched]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, Domain d> { + def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2), + asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + NoItinerary, d>; + def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + NoItinerary, d>; +} + +//===----------------------------------------------------------------------===// +// MMX EMMS Instruction +//===----------------------------------------------------------------------===// + +def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", + [(int_x86_mmx_emms)]>; + +//===----------------------------------------------------------------------===// +// MMX Scalar Instructions +//===----------------------------------------------------------------------===// + +// Data Transfer Instructions +def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector GR32:$src)))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; +let canFoldAsLoad = 1 in +def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; +let mayStore = 1 in +def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>, + Sched<[WriteStore]>; + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (MMX_X86movd2w (x86mmx VR64:$src)))], + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; + +let neverHasSideEffects = 1 in +def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; + +// These are 64 bit moves, but since the OS X assembler doesn't +// recognize a register-register movq, we write them as +// movd. 
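+// (With REX.W, movd and movq share the 0F 6E / 0F 7E encodings, so the bytes
+// emitted are the same either way; only the printed mnemonic differs.)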
+let SchedRW = [WriteMove] in { +def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, + (outs GR64:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, + (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; +def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>; +let neverHasSideEffects = 1 in +def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +} // SchedRW + +let SchedRW = [WriteLoad] in { +let canFoldAsLoad = 1 in +def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (load_mmx addr:$src))], + IIC_MMX_MOVQ_RM>; +def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (x86mmx VR64:$src), addr:$dst)], + IIC_MMX_MOVQ_RM>; +} // SchedRW + +let SchedRW = [WriteMove] in { +def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; + +def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 + (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; + +let neverHasSideEffects = 1 in +def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; + +def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; +} // SchedRW + +def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movntq\t{$src, $dst|$dst, $src}", + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], + IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>; + +let AddedComplexity = 15 in +// movd to MMX register zero-extends +def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; +let AddedComplexity = 20 in +def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (X86vzmovl (x86mmx + (scalar_to_vector (loadi32 addr:$src))))))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; + +// Arithmetic Instructions +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, + MMX_INTALU_ITINS>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, + MMX_INTALU_ITINS>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, + MMX_INTALU_ITINS>; +// -- Addition +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, + MMX_INTALUQ_ITINS, 1>; +defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , 
int_x86_mmx_padds_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w, + MMX_PHADDSUBW>; +defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d, + MMX_PHADDSUBD>; +defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, + MMX_PHADDSUBW>; + + +// -- Subtraction +defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, + MMX_INTALU_ITINS, 1>; +defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, + MMX_INTALU_ITINS, 1>; +defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, + MMX_INTALUQ_ITINS, 1>; + +defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, + MMX_PHADDSUBW>; +defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d, + MMX_PHADDSUBD>; +defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw, + MMX_PHADDSUBW>; + +// -- Multiplication +defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, + MMX_PMUL_ITINS, 1>; + +defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, + MMX_PMUL_ITINS, 1>; +defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, + MMX_PMUL_ITINS, 1>; +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, + MMX_PMUL_ITINS, 1>; +let isCommutable = 1 in +defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>; + +// -- Miscellanea +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, + MMX_PMUL_ITINS, 1>; + +defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, + MMX_PSADBW_ITINS, 1>; + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, + MMX_MISC_FUNC_ITINS>; +let Constraints = "$src1 = $dst" in + defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; + +// 
Logical Instructions +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, + MMX_INTALU_ITINS, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, + MMX_INTALU_ITINS, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, + MMX_INTALU_ITINS, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, + MMX_INTALU_ITINS>; + +// Shift Instructions +defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, + MMX_SHIFT_ITINS>; +defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, + MMX_SHIFT_ITINS>; +defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, + MMX_SHIFT_ITINS>; + +defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_mmx_psll_w, int_x86_mmx_pslli_w, + MMX_SHIFT_ITINS>; +defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_mmx_psll_d, int_x86_mmx_pslli_d, + MMX_SHIFT_ITINS>; +defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_mmx_psll_q, int_x86_mmx_pslli_q, + MMX_SHIFT_ITINS>; + +defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_mmx_psra_w, int_x86_mmx_psrai_w, + MMX_SHIFT_ITINS>; +defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_mmx_psra_d, int_x86_mmx_psrai_d, + MMX_SHIFT_ITINS>; + +// Comparison Instructions +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, + MMX_INTALU_ITINS>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, + MMX_INTALU_ITINS>; + +// -- Unpack Instructions +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", + int_x86_mmx_punpckhbw, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", + int_x86_mmx_punpckhwd, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", + int_x86_mmx_punpckhdq, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", + int_x86_mmx_punpcklbw, + MMX_UNPCK_L_ITINS>; +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", + int_x86_mmx_punpcklwd, + MMX_UNPCK_L_ITINS>; +defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", + int_x86_mmx_punpckldq, + MMX_UNPCK_L_ITINS>; + +// -- Pack Instructions +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, + MMX_PCK_ITINS>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, + MMX_PCK_ITINS>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, + MMX_PCK_ITINS>; + +// -- Shuffle Instructions +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, + MMX_PSHUF_ITINS>; + +def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, + (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], + IIC_MMX_PSHUF>, Sched<[WriteShuffle]>; +def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, + (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), + "pshufw\t{$src2, $src1, 
$dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w (load_mmx addr:$src1), + imm:$src2))], + IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>; + + + + +// -- Conversion Instructions +defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, + f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PS_ITINS, SSEPackedSingle>, TB; +defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, + f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; +defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, + f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PS_ITINS, SSEPackedSingle>, TB; +defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, + f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; +defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, + i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; +let Constraints = "$src1 = $dst" in { + defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, + int_x86_sse_cvtpi2ps, + i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; +} + +// Extract / Insert +def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, + (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1, + (iPTR imm:$src2)))], + IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; +let Constraints = "$src1 = $dst" in { + def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, + (outs VR64:$dst), + (ins VR64:$src1, GR32:$src2, i32i8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + GR32:$src2, (iPTR imm:$src3)))], + IIC_MMX_PINSRW>, Sched<[WriteShuffle]>; + + def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, + (outs VR64:$dst), + (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + (i32 (anyext (loadi16 addr:$src2))), + (iPTR imm:$src3)))], + IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +// Mask creation +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_mmx_pmovmskb VR64:$src))]>; + + +// Low word of XMM to MMX. +def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; + +def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), + (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; + +def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), + (x86mmx (MMX_MOVQ64rm addr:$src))>; + +// Misc. +let SchedRW = [WriteShuffle] in { +let Uses = [EDI] in +def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], + IIC_MMX_MASKMOV>; +let Uses = [RDI] in +def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], + IIC_MMX_MASKMOV>; +} + +// 64-bit bit convert. 
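The bitconvert patterns that follow have direct counterparts at the source
level. A minimal C sketch (assuming an x86-64 compiler with the standard
intrinsics headers; the helper names here are ours, not LLVM's):

#include <mmintrin.h>
#include <stdint.h>
#include <string.h>

/* GR64 <-> VR64: the MMX_MOVD64to64rr / MMX_MOVD64from64rr patterns. */
static __m64    gpr_to_mmx(uint64_t x) { return _mm_cvtsi64_m64((long long)x); }
static uint64_t mmx_to_gpr(__m64 v)    { return (uint64_t)_mm_cvtm64_si64(v); }

/* FR64 <-> VR64 is likewise a pure bit copy (MMX_MOVQ2FR64rr and friends). */
static double mmx_to_f64(__m64 v) {
  double d;
  memcpy(&d, &v, sizeof d);  /* bitconvert: same 64 bits, new register file */
  return d;                  /* real MMX code must also call _mm_empty()    */
}

No arithmetic happens in any of these; the patterns below only choose the
cross-register-file move instruction for each direction.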
+let Predicates = [HasSSE2] in {
+def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+          (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..cce938b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,8379 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  // InstrSchedModel info.
+  X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+  OpndItins s = arg_s;
+  OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+                     InstrItinClass arg_ri> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+  SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins<
+  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+  SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins<
+  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+  SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+  SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins<
+  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+  SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins<
+  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+  SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+  IIC_SSE_INTALUQ_P_RR,
IIC_SSE_INTALUQ_P_RM +>; +} + +let Sched = WriteVecIMul in +def SSE_INTMUL_ITINS_P : OpndItins< + IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM +>; + +def SSE_INTSHIFT_ITINS_P : ShiftOpndItins< + IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI +>; + +def SSE_MOVA_ITINS : OpndItins< + IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM +>; + +def SSE_MOVU_ITINS : OpndItins< + IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM +>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 Instructions Classes +//===----------------------------------------------------------------------===// + +/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class +multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, X86MemOperand x86memop, + OpndItins itins, + bit Is2Addr = 1> { + let isCommutable = 1 in { + def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + } + def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + string asm, string SSEVer, string FPSizeStr, + Operand memopr, ComplexPattern mem_cpat, + OpndItins itins, + bit Is2Addr = 1> { + def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", + SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, mem_cpat:$src2))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_packed - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, OpndItins itins, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, + Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], + itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed 
instructions class +multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, + string OpcodeStr, X86MemOperand x86memop, + list<dag> pat_rr, list<dag> pat_rm, + bit Is2Addr = 1> { + let isCommutable = 1, hasSideEffects = 0 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rr, NoItinerary, d>, + Sched<[WriteVecLogic]>; + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rm, NoItinerary, d>, + Sched<[WriteVecLogicLd, ReadAfterLd]>; +} + +//===----------------------------------------------------------------------===// +// Non-instruction patterns +//===----------------------------------------------------------------------===// + +// A vector extract of the first f32/f64 position is a subregister copy +def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; +def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; + +// A 128-bit subvector extract from the first 256-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; + +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; + +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))), + (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))), + (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; + +// A 128-bit subvector insert to the first 256-bit vector position +// is a subregister copy that needs no instruction. +let AddedComplexity = 25 in { // to give priority over vinsertf128rm +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +} + +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +def : Pat<(v8f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +// Implicitly promote a 64-bit scalar to a vector. 
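A brief illustration may help here. The closest C spelling of
scalar_to_vector is _mm_set_ss/_mm_set_sd (a hedged sketch; note that these
intrinsics define the upper lanes as zero, while scalar_to_vector leaves them
undefined, which is exactly why the patterns can be a plain COPY_TO_REGCLASS):

#include <emmintrin.h>

__m128  promote_f32(float x)  { return _mm_set_ss(x); }  /* x in lane 0 */
__m128d promote_f64(double x) { return _mm_set_sd(x); }  /* x in lane 0 */

Since FR32/FR64 values already live in the low lane of an XMM register, the
promotions below need no instruction at all, only a register-class change: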
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; +def : Pat<(v4f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; + +// Bitcasts between 128-bit vector types. Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +} + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; +} + +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>; +} + +//===----------------------------------------------------------------------===// +// AVX & SSE - Zero/One Vectors +//===----------------------------------------------------------------------===// + +// Alias instruction that maps zero vector to pxor / xorp* for sse. +// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then +// swizzled by ExecutionDepsFix to pxor. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. 
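For reference, these pseudos model the standard zero/all-ones idioms that C
compilers emit for the corresponding intrinsics (a sketch, assuming SSE2):

#include <emmintrin.h>

__m128  zero_ps(void)  { return _mm_setzero_ps(); }     /* xorps xmm,xmm   */
__m128i zero_si(void)  { return _mm_setzero_si128(); }  /* pxor  xmm,xmm   */
__m128i all_ones(void) {                                /* pcmpeqd xmm,xmm */
  __m128i z = _mm_setzero_si128();
  return _mm_cmpeq_epi32(z, z);   /* x == x is true in every lane */
}

Because the xor/cmpeq idioms are dependency-breaking and as cheap as a move,
the pseudos below stay rematerializable instead of being tied to one opcode: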
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+
+
+// The same as above, but for AVX. The 256-bit AVX1 ISA doesn't support PI,
+// and doesn't need it because on Sandy Bridge the register is set to zero
+// at the rename stage without using any execution unit, so the AVX_SET0
+// pseudo below can be used for vector int instructions without penalty.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX] in
+  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+}
+
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+  let Predicates = [HasAVX2] in
+    def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                            [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and, as just mentioned, we don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===// + +multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string base_opc, + string asm_opr> { + def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [(set VR128:$dst, (vt (OpNode VR128:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>; + + // For the disassembler + let isCodeGenOnly = 1, hasSideEffects = 0 in + def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>; +} + +multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string OpcodeStr> { + // AVX + defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, + VEX_4V, VEX_LIG; + + def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + VEX, VEX_LIG, Sched<[WriteStore]>; + // SSE1 & 2 + let Constraints = "$src1 = $dst" in { + defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $dst|$dst, $src2}">; + } + + def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + Sched<[WriteStore]>; +} + +// Loading from memory automatically zeroing upper bits. +multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_pat, string OpcodeStr> { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>; + def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>; +} + +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD; + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; + + let AddedComplexity = 20 in + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; +} + +// Patterns +let Predicates = [HasAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. 
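The "keep the scalar, zero the rest" behavior being matched here is the movss
idiom in C terms (a sketch, SSE only; the helper name is ours):

#include <xmmintrin.h>

/* Keep lane 0 of v, force lanes 1-3 to 0.0f -- exactly X86vzmovl. */
__m128 clear_high(__m128 v) {
  return _mm_move_ss(_mm_setzero_ps(), v);  /* movss: dst = (v0, 0, 0, 0) */
}

The 256-bit patterns below do the same through a 128-bit VMOVSS on the low
subregister, relying on VEX-encoded ops zeroing the upper YMM half: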
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; + } + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (v2i64 (V_SET0)), + (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; + + // Extract and store. 
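In source terms, these store patterns are the scalar store intrinsics (a
minimal sketch, assuming SSE2 headers; helper names ours):

#include <emmintrin.h>

void store_low_f32(float *p, __m128 v)   { _mm_store_ss(p, v); }  /* movss */
void store_low_f64(double *p, __m128d v) { _mm_store_sd(p, v); }  /* movsd */

Extracting lane 0 costs nothing; the patterns below fold the extract straight
into the store: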
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; + def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), + sub_xmm)>; + + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; +} + +let Predicates = [UseSSE1] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + } + + let AddedComplexity = 20 in { + // MOVSSrm already zeros the high parts of the register. 
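The same zero-fill guarantee is visible from C through the scalar load
intrinsics (a sketch, assuming SSE2):

#include <emmintrin.h>

__m128  load_f32(const float *p)  { return _mm_load_ss(p); } /* (p[0],0,0,0) */
__m128d load_f64(const double *p) { return _mm_load_sd(p); } /* (p[0],0)     */

That zero-fill is why a vzmovl wrapped around the load can simply be dropped,
as the patterns below do: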
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + } + + // Extract and store. + def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; + + // Shuffle with MOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; +} + +let Predicates = [UseSSE2] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSD to the lower bits. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + } + + let AddedComplexity = 20 in { + // MOVSDrm already zeros the high parts of the register. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + } + + // Extract and store. + def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; + + // Shuffle with MOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. 
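What these register-register patterns ultimately emit is the movsd merge, the
same operation _mm_move_sd performs in C (a sketch; helper name ours):

#include <emmintrin.h>

/* dst = (b[0], a[1]): low lane from b, high lane kept from a -- movsd. */
__m128d merge_low(__m128d a, __m128d b) { return _mm_move_sd(a, b); }

With that in mind, selecting MOVSDrr for the X86Movlpd/X86Movlps shuffles
below is just the register form of the same merge: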
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d, + OpndItins itins, + bit IsReMaterializable = 1> { +let neverHasSideEffects = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, + Sched<[WriteMove]>; +let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>, + Sched<[WriteLoad]>; +} + +defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + TB, VEX; +defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + TB, OpSize, VEX; +defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + TB, VEX; +defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + TB, OpSize, VEX; + +defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + TB, VEX, VEX_L; +defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + TB, OpSize, VEX, VEX_L; +defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + TB, VEX, VEX_L; +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + TB, OpSize, VEX, VEX_L; +defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + TB; +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + TB, OpSize; +defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + TB; +defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + TB, OpSize; + +let SchedRW = [WriteStore] in { +def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; +def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; +def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; +def VMOVUPDmr : 
VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; +def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; +def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; +def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; +def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { + def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; + def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; + def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; + def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; + def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; + def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; +} + +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzmovl + (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v8f32 (X86vzmovl + (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4f64 (X86vzmovl + (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +} + + +def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), + (VMOVUPSYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), + (VMOVUPDYmr addr:$dst, VR256:$src)>; + +let SchedRW = [WriteStore] in { +def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; +def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), 
addr:$dst)], + IIC_SSE_MOVA_P_MR>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { + def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; + def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; +} + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (VMOVUPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [UseSSE1] in + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (MOVUPSmr addr:$dst, VR128:$src)>; +let Predicates = [UseSSE2] in + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (MOVUPDmr addr:$dst, VR128:$src)>; + +// Use vmovaps/vmovups for AVX integer load/store. +let Predicates = [HasAVX] in { + // 128-bit load/store + def : Pat<(alignedloadv2i64 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (VMOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + + // 256-bit load/store + def : Pat<(alignedloadv4i64 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv4i64 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v4i64 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v8i32 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16i16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v32i8 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + + // Special patterns for storing subvector extracts of lower 128-bits + // Its 
cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + def : Pat<(alignedstore (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + + def : Pat<(store (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; +} + +// Use movaps / movups for SSE integer load / store (one byte shorter). +// The instructions selected below are then converted to MOVDQA/MOVDQU +// during the SSE domain pass. +let Predicates = [UseSSE1] in { + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + +// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper +// bits are disregarded. FIXME: Set encoding to pseudo! 
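For the integer load/store patterns just above: in C these correspond to the
si128 load/store intrinsics, and the movaps/movups forms chosen here encode
one byte shorter than movdqa/movdqu (a sketch, assuming SSE2):

#include <emmintrin.h>

__m128i load_ints(const __m128i *p)       { return _mm_load_si128(p); }
void    store_ints(__m128i *p, __m128i v) { _mm_storeu_si128(p, v); }

/* Whether the bytes emitted are movaps or movdqa is purely an encoding and
   execution-domain choice; the value loaded or stored is identical. */

The Fs* aliases that follow apply the same full-register trick to scalar
FR32/FR64 copies: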
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in { +def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; +def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; +def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; +def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; +} + +// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper +// bits are disregarded. FIXME: Set encoding to pseudo! +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +let isCodeGenOnly = 1 in { + def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; + def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; +} +def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>; +def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low packed FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, + string base_opc, string asm_opr, + InstrItinClass itin> { + def PSrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "s", asm_opr), + [(set VR128:$dst, + (psnode VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], + itin, SSEPackedSingle>, TB, + Sched<[WriteShuffleLd, ReadAfterLd]>; + + def PDrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "d", asm_opr), + [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))))], + itin, SSEPackedDouble>, TB, OpSize, + Sched<[WriteShuffleLd, ReadAfterLd]>; + +} + +multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, + string base_opc, InstrItinClass itin> { + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + itin>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $dst|$dst, $src2}", + itin>; +} + +let AddedComplexity = 20 in { + defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp", + IIC_SSE_MOV_LH>; +} + +let SchedRW = [WriteStore] in { +def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; +def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 
VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; +def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; +def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; +} // SchedRW + +let Predicates = [HasAVX] in { + // Shuffle with VMOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + + // Shuffle with VMOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE1] in { + // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS + def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), + (iPTR 0))), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + + // Shuffle with MOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE2] in { + // Shuffle with MOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Hi packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20 in { + defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp", + IIC_SSE_MOV_LH>; +} + +let SchedRW = [WriteStore] in { +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. 
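These high-lane stores are _mm_storeh_pd / _mm_storeh_pi in C (a sketch,
assuming SSE2; helper names ours):

#include <emmintrin.h>

void store_high_f64(double *p, __m128d v) { _mm_storeh_pd(p, v); } /* movhpd */
void store_high_2f32(__m64 *p, __m128 v)  { _mm_storeh_pi(p, v); } /* movhps */

The unpckh-based dags below are how that "extract element 1, then store" is
expressed so the store forms still fold cleanly: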
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; +def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; +} // SchedRW + +let Predicates = [HasAVX] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE1] in { + // MOVHPS patterns + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2] in { + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. 
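The fold being described is the one _mm_loadh_pd exposes directly in C (a
sketch, assuming SSE2; helper names ours):

#include <emmintrin.h>

/* (a[0], p[0]): keep the low lane, load the high lane -- movhpd. */
__m128d load_high(__m128d a, const double *p) { return _mm_loadh_pd(a, p); }

/* The equivalent unpcklpd-of-a-load form that the pattern below matches: */
__m128d load_high_via_unpack(__m128d a, const double *p) {
  return _mm_unpacklo_pd(a, _mm_load_sd(p));
}

Hence the pattern: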
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20 in { + def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, + VEX_4V, Sched<[WriteShuffle]>; + def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, + VEX_4V, Sched<[WriteShuffle]>; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; + def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; +} + +let Predicates = [HasAVX] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (VMOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE1] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// + +def SSE_CVT_PD : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM +>; + +let Sched = WriteCvtI2F in +def SSE_CVT_PS : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; + +let Sched = WriteCvtI2F in +def SSE_CVT_Scalar : OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_32 : OpndItins< + IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_64 : OpndItins< + IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SD2SI : OpndItins< + IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM +>; + +multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm, OpndItins itins> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode 
(ld_frag addr:$src)))],
+            itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                       X86MemOperand x86memop, string asm, Domain d,
+                       OpndItins itins> {
+let neverHasSideEffects = 1 in {
+  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+             [], itins.rr, d>, Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+}
+}
+
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              Sched<[WriteCvtI2F]>;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // neverHasSideEffects = 1
+}
+
+defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_32>,
+                                XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_64>,
+                                XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+// The assembler can recognize rr 64-bit instructions by seeing an rxx
+// register, but the same isn't true when only memory operands are used.
+// Provide explicit "l" and "q" assembly forms to address this where
+// appropriate to do so.
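+// For example (AT&T syntax, illustrative only): "vcvtsi2ss %rax, %xmm1, %xmm0"
+// is unambiguously the 64-bit form because of %rax, but a memory source gives
+// no size hint, so "vcvtsi2ssq (%rdi), %xmm1, %xmm0" needs the explicit "q"
+// suffix provided by the {l}/{q} mnemonic variants below.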
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">, + XS, VEX_4V, VEX_LIG; +defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, + XS, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, + XD, VEX_4V, VEX_LIG; +defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, + XD, VEX_4V, VEX_W, VEX_LIG; + +def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>; +def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; + +let Predicates = [HasAVX] in { + def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>; + def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, XS; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, XS, REX_W; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD, REX_W; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS; +defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS, REX_W; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD; +defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD, REX_W; + +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; + +def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", + (CVTSI2SSrm FR64:$dst, 
i32mem:$src)>; +def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", + (CVTSI2SDrm FR64:$dst, i32mem:$src)>; + +// Conversion Instructions Intrinsics - Match intrinsics which expect MM +// and/or XMM operand(s). + +multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, Operand memop, ComplexPattern mem_cpat, + string asm, OpndItins itins> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, + Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, OpndItins itins, + bit Is2Addr = 1> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, + int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; + +defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD; +defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; + + +defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", + SSE_CVT_Scalar, 0>, XS, VEX_4V; +defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", + SSE_CVT_Scalar, 0>, XS, VEX_4V, + VEX_W; +defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", + SSE_CVT_Scalar, 0>, XD, VEX_4V; +defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", + SSE_CVT_Scalar, 0>, XD, + VEX_4V, VEX_W; + +let Constraints = "$src1 = $dst" in { + defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, + "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; + defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, + "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; + defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, + "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; + defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, + "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; +} + +/// SSE 1 Only + +// Aliases for intrinsics +defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, 
VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS, VEX; +defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W; +defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD, VEX; +defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, + XD, VEX, VEX_W; +defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS; +defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W; +defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD; +defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; + +defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; +defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; + +defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_32>, XS; +defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_64>, XS, REX_W; + +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + TB, VEX, Requires<[HasAVX]>; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + TB, VEX, VEX_L, Requires<[HasAVX]>; + +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + TB, Requires<[UseSSE2]>; + +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; + +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : 
InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + +/// SSE 2 Only + +// Convert scalar double to scalar single +let neverHasSideEffects = 1 in { +def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), + (ins FR64:$src1, FR64:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), + (ins FR64:$src1, f64mem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RM>, + XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, + Requires<[HasAVX]>; + +def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround FR64:$src))], + IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>; +def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround (loadf64 addr:$src)))], + IIC_SSE_CVT_Scalar_RM>, + XD, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))], + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; + +let Constraints = "$src1 = $dst" in { +def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; +def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))], + IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Convert scalar single to scalar double +// SSE2 instructions with XS prefix +let neverHasSideEffects = 1 in { +def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), + (ins FR32:$src1, FR32:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RR>, + XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), + (ins FR32:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RM>, + XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +def : Pat<(f64 (fextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(fextend (loadf32 addr:$src)), + 
(VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>; + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + Requires<[HasAVX, OptForSpeed]>; + +def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (fextend FR32:$src))], + IIC_SSE_CVT_Scalar_RR>, XS, + Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; +def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))], + IIC_SSE_CVT_Scalar_RM>, XS, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +// extload f32 -> f64. This matches load+fextend because we have a hack in +// the isel (PreprocessForFPConvert) that can introduce loads after dag +// combine. +// Since these loads aren't folded into the fextend, we have to match it +// explicitly here. +def : Pat<(fextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; + +def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; +def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix +def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; +def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Convert packed single/double fp to doubleword +def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; +def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, 
Sched<[WriteCvtF2ILd]>; +def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; +def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + VEX, Sched<[WriteCvtF2I]>; + +// XMM only +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src)>; +def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX, + Sched<[WriteCvtF2ILd]>; + +// YMM only +def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L, + Sched<[WriteCvtF2I]>; +def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>, + VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; +} + +def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; +def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; + +// Convert with truncation packed single/double fp to doubleword +// SSE2 packed instructions with XS prefix +def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; +def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (memopv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + Sched<[WriteCvtF2ILd]>; + +def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, 
$src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; +def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (VCVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (VCVTDQ2PSrm addr:$src)>; + + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (VCVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (VCVTTPS2DQrm addr:$src)>; + + def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))), + (VCVTDQ2PSYrm addr:$src)>; + + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), + (VCVTTPS2DQYrr VR256:$src)>; + def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))), + (VCVTTPS2DQYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (CVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (CVTTPS2DQrm addr:$src)>; +} + +def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
+ +// XMM only +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src)>; +def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; + +// YMM only +def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; + +let Predicates = [HasAVX] in { + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), + (VCVTTPD2DQYrm addr:$src)>; +} // Predicates = [HasAVX] + +def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; +def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, + Sched<[WriteCvtF2ILd]>; + +// Convert packed single to packed double +let Predicates = [HasAVX] in { + // SSE2 instructions without OpSize prefix +def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>; +def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>; +def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], + IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>; +def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; +} + +let Predicates = [UseSSE2] in { +def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>; +def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>; +} + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX] in { +let neverHasSideEffects = 1, mayLoad = 1 in +def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + []>, VEX, Sched<[WriteCvtI2FLd]>; +def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, 
(outs VR128:$dst), (ins VR128:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
+                     Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvtdq2_pd_256
+                        (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+                     Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
+                     Sched<[WriteCvtI2F]>;
+}
+
+let neverHasSideEffects = 1, mayLoad = 1 in
+def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                      "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
+
+// AVX 256-bit register conversion intrinsics
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+            (VCVTDQ2PDYrr VR128:$src)>;
+  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
+def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+
+// YMM only
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
+// whenever possible to avoid declaring two versions of each one.
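+// The patterns just below illustrate the preferred style: a single
+// instruction definition plus a separate pattern, e.g.
+//   def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), (VCVTDQ2PSYrr VR256:$src)>;
+// rather than declaring a second Int_* instruction per intrinsic.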
+let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), + (VCVTDQ2PSYrm addr:$src)>; + + // Match fround and fextend for 128/256-bit conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + (VCVTPD2PSXrm addr:$src)>; + def : Pat<(v4f32 (fround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (VCVTPS2PDrr VR128:$src)>; + def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + // Match fround and fextend for 128 conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + (CVTPD2PSrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (CVTPS2PDrr VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Compare Instructions +//===----------------------------------------------------------------------===// + +// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions +multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, + Operand CC, SDNode OpNode, ValueType VT, + PatFrag ld_frag, string asm, string asm_alt, + OpndItins itins> { + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), + (ld_frag addr:$src2), imm:$cc))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. 
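+  // For example, "cmpss $3, %xmm1, %xmm0" assembles through the *_alt forms
+  // below and is equivalent to "cmpunordss %xmm1, %xmm0" (predicate 3 = unord).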
+ let neverHasSideEffects = 1 in { + def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RM>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32, + "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S>, + XS, VEX_4V, VEX_LIG; +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64, + "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S>, // same latency as 32 bit compare + XD, VEX_4V, VEX_LIG; + +let Constraints = "$src1 = $dst" in { + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32, + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>, + XS; + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64, + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSE_ALU_F32S>, // same latency as 32 bit compare + XD; +} + +multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, + Intrinsic Int, string asm, OpndItins itins> { + def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + VR128:$src, imm:$cc))], + itins.rr>, + Sched<[itins.Sched]>; + def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + (load addr:$src), imm:$cc))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +// Aliases to match intrinsics which expect XMM operand(s). 
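+// These use VR128 rather than FR32/FR64 because, for example,
+// int_x86_sse_cmp_ss operates on whole XMM values: the low element holds
+// the compare mask while the upper elements of $src1 pass through unchanged
+// (assumed semantics of the scalar compare intrinsics).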
+defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S>, + XS, VEX_4V; +defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S>, // same latency as f32 + XD, VEX_4V; +let Constraints = "$src1 = $dst" in { + defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S>, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S>, // same latency as f32 + XD; +} + + +// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS +multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, X86MemOperand x86memop, + PatFrag ld_frag, string OpcodeStr, Domain d> { + def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], + IIC_SSE_COMIS_RR, d>, + Sched<[WriteFAdd]>; + def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + (ld_frag addr:$src2)))], + IIC_SSE_COMIS_RM, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; +} + +let Defs = [EFLAGS] in { + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, TB, OpSize, VEX, + VEX_LIG; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB, VEX, + VEX_LIG; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize, VEX, + VEX_LIG; + } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, TB, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, TB, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd", SSEPackedDouble>, TB, OpSize, VEX; + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, TB; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, TB, OpSize; + + let Pattern = []<dag> in { + defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB; + defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize; + } + + defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, TB; + defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, TB, OpSize; + + defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB; + defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize; +} // Defs = [EFLAGS] + +// sse12_cmp_packed - sse 1 & 2 compare packed instructions +multiclass sse12_cmp_packed<RegisterClass RC, 
X86MemOperand x86memop, + Operand CC, Intrinsic Int, string asm, + string asm_alt, Domain d> { + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], + IIC_SSE_CMPP_RR, d>, + Sched<[WriteFAdd]>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], + IIC_SSE_CMPP_RM, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. + let neverHasSideEffects = 1 in { + def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>; + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_CMPP_RM, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; + } +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle>, TB, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle>, TB, VEX_4V, VEX_L; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in { + defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", + "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedSingle>, TB; + defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", + "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedDouble>, TB, OpSize; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE1] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri (v4f32 VR128:$src1), 
(v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+                         ValueType vt, string asm, PatFrag mem_frag,
+                         Domain d, bit IsConvertibleToThreeAddress = 0> {
+  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+                   Sched<[WriteShuffleLd, ReadAfterLd]>;
+  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+                   Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
+                    TB;
+  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
+                    TB, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  // 256-bit patterns
+  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8i32 (X86Shufp VR256:$src1,
+                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4i64 (X86Shufp VR256:$src1,
+                      (memopv4i64 addr:$src2), (i8
imm:$imm))), + (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [UseSSE1] in { + def : Pat<(v4i32 (X86Shufp VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [UseSSE2] in { + // Generic SHUFPD patterns + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (memopv2i64 addr:$src2), (i8 imm:$imm))), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Unpack Instructions +//===----------------------------------------------------------------------===// + +/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave +multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, + PatFrag mem_frag, RegisterClass RC, + X86MemOperand x86memop, string asm, + Domain d> { + def rr : PI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, RC:$src2)))], + IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>; + def rm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, + (mem_frag addr:$src2))))], + IIC_SSE_UNPCK, d>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; + +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V, VEX_L; +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V, VEX_L; +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; + +let Constraints = "$src1 = $dst" in { + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, TB, OpSize; + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", + 
SSEPackedSingle>, TB;
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble>, TB, OpSize;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
+  // FIXME: Instead of X86Movddup, there should be an X86Unpckl here; the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold because it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // FIXME: Instead of X86Movddup, there should be an X86Unpckl here; the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold because it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
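+  // For example, (v2f64 (X86Movddup VR128:$src)) is matched below with
+  // (UNPCKLPDrr VR128:$src, VR128:$src), which duplicates the low f64 into
+  // both lanes, the same result movddup would produce.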
+ def : Pat<(v2f64 (X86Movddup VR128:$src)), + (UNPCKLPDrr VR128:$src, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Extract Floating-Point Sign Mask +//===----------------------------------------------------------------------===// + +/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask +multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, + Domain d> { + def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, + Sched<[WriteVecLogic]>; + def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>; +} + +let Predicates = [HasAVX] in { + defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, TB, VEX; + defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, TB, + OpSize, VEX; + defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, TB, + VEX, VEX_L; + defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, TB, + OpSize, VEX, VEX_L; + + def : Pat<(i32 (X86fgetsign FR32:$src)), + (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(i64 (X86fgetsign FR32:$src)), + (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(i32 (X86fgetsign FR64:$src)), + (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(i64 (X86fgetsign FR64:$src)), + (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // Assembler Only + def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>; + def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedDouble>, TB, + OpSize, VEX, Sched<[WriteVecLogic]>; + def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>; + def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedDouble>, TB, + OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>; +} + +defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", + SSEPackedSingle>, TB; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", + SSEPackedDouble>, TB, OpSize; + +def : Pat<(i32 (X86fgetsign FR32:$src)), + (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>, + Requires<[UseSSE1]>; +def : Pat<(i64 (X86fgetsign FR32:$src)), + (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>, + Requires<[UseSSE1]>; +def : Pat<(i32 (X86fgetsign FR64:$src)), + (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>, + Requires<[UseSSE2]>; +def : Pat<(i64 (X86fgetsign FR64:$src)), + (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>, + Requires<[UseSSE2]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +///
PDI_binop_rm - Simple SSE2 binary operator. +multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit IsCommutable, bit Is2Addr> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, + ValueType OpVT128, ValueType OpVT256, + OpndItins itins, bit IsCommutable = 0> { +let Predicates = [HasAVX] in + defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, + VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, + memopv2i64, i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2] in + defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, + OpVT256, VR256, memopv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + +// These are ordered here for pattern ordering requirements with the fp versions + +defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, + SSE_BIT_ITINS_P, 0>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Logical Instructions +//===----------------------------------------------------------------------===// + +/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops +/// +multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode, OpndItins itins> { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>, + TB, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>, + TB, OpSize, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32, SSEPackedSingle, itins>, + TB; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64, SSEPackedDouble, itins>, + TB, OpSize; + } +} + +// Alias bitwise logical operations using SSE logical ops on packed FP values. 
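+// These nodes typically come from scalar sign-bit lowering; e.g. for an
+// f32 value x kept in an XMM register:
+//   fabs(x) -> X86fand(x, bitcast<f32>(0x7fffffff))
+//   fneg(x) -> X86fxor(x, bitcast<f32>(0x80000000))
+// so a full-width ANDPS/XORPS implements the scalar operation (the upper
+// lanes of an FR32/FR64 value are don't-care).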
+defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; +defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + SSE_BIT_ITINS_P>; +defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + +let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, + SSE_BIT_ITINS_P>; + +/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops +/// +multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, + [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), + (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (bc_v4i64 (v4f64 VR256:$src2))))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (memopv4i64 addr:$src2)))], 0>, + TB, OpSize, VEX_4V, VEX_L; + + // In AVX no need to add a pattern for 128-bit logical rr ps, because they + // are all promoted to v2i64, and the patterns are covered by the int + // version. This is needed in SSE only, because v2i64 isn't supported on + // SSE1, but only on SSE2. + defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, [], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))], 0>, + TB, OpSize, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, TB; + + defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, TB, OpSize; + } +} + +defm AND : sse12_fp_packed_logical<0x54, "and", and>; +defm OR : sse12_fp_packed_logical<0x56, "or", or>; +defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; +let isCommutable = 0 in + defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Arithmetic Instructions +//===----------------------------------------------------------------------===// + +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and +/// vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements unmodified (therefore these cannot be commuted). 
+/// +/// These three forms can each be reg+reg or reg+mem. +/// + +/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those +/// classes below +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, + itins.s, Is2Addr>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, + itins.d, Is2Addr>, XD; +} + +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, SizeItins itins> { +let Predicates = [HasAVX] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, memopv4f32, + SSEPackedSingle, itins.s, 0>, TB, VEX_4V; + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, memopv2f64, + SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, VR256, v8f32, f256mem, memopv8f32, + SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L; + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, VR256, v4f64, f256mem, memopv4f64, + SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, + itins.s, 1>, TB; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, + itins.d, 1>, TB, OpSize; +} +} + +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + SizeItins itins, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + itins.s, Is2Addr>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + itins.d, Is2Addr>, XD; +} + +// Binary Arithmetic instructions +defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>; +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>; +let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>; + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>; + defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>; + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>; +} + +let isCodeGenOnly = 1 in { + defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>; + defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>; +} + +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>, + VEX_4V, VEX_LIG; + +let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 
SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; +} + +let Constraints = "$src1 = $dst" in { + defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; + defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; + + let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; + defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; + defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; + defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; + } +} + +let isCodeGenOnly = 1 in { + defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + let Constraints = "$src1 = $dst" in { + defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; + defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; + } +} + +/// Unop Arithmetic +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. + +let Sched = WriteFSqrt in { +def SSE_SQRTP : OpndItins< + IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM +>; + +def SSE_SQRTS : OpndItins< + IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM +>; +} + +let Sched = WriteFRcp in { +def SSE_RCPP : OpndItins< + IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM +>; + +def SSE_RCPS : OpndItins< + IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM +>; +} + +/// sse1_fp_unop_s - SSE1 unops in scalar form. +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int, OpndItins itins> { +let Predicates = [HasAVX], hasSideEffects = 0 in { + def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), + (ins FR32:$src1, FR32:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; + let mayLoad = 1 in { + def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), + (ins FR32:$src1,f32mem:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; + // For scalar unary operations, fold a load into the operation + // only in OptForSize mode. 
It eliminates an instruction, but it also + // eliminates a whole-register clobber (the load), so it introduces a + // partial register update condition. + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, + Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>, + Sched<[itins.Sched]>; + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +/// sse1_fp_unop_rw - SSE1 unops where vector form has a read-write operand. +multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { +let Predicates = [HasAVX], hasSideEffects = 0 in { + def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), + (ins FR32:$src1, FR32:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; + let mayLoad = 1 in { + def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), + (ins FR32:$src1,f32mem:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; + // For scalar unary operations, fold a load into the operation + // only in OptForSize mode. It eliminates an instruction, but it also + // eliminates a whole-register clobber (the load), so it introduces a + // partial register update condition. + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, + Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; + let Constraints = "$src1 = $dst" in { + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), + [], itins.rr>, Sched<[itins.Sched]>; + let mayLoad = 1, hasSideEffects = 0 in + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), + [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>, + Sched<[itins.Sched]>; + def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. +multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, + Intrinsic V4F32Int, Intrinsic V8F32Int, + OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int VR128:$src))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V8F32Int VR256:$src))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), + (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int VR128:$src))], + itins.rr>, Sched<[itins.Sched]>; + def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +/// sse2_fp_unop_s - SSE2 unops in scalar form. 
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F64Int, OpndItins itins> { +let Predicates = [HasAVX], hasSideEffects = 0 in { + def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), + (ins FR64:$src1, FR64:$src2), + !strconcat("v", OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; + let mayLoad = 1 in { + def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), + (ins FR64:$src1,f64mem:$src2), + !strconcat("v", OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, sdmem:$src2), + !strconcat("v", OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + + def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>, + Sched<[itins.Sched]>; + // See the comments in sse1_fp_unop_s for why this is OptForSize. + def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, + Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>; + def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>, + Sched<[itins.Sched]>; + def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +/// sse2_fp_unop_p - SSE2 unops in vector forms. +multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>, + Sched<[itins.Sched]>; + def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +// Square root. 
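+// A note on the AVX scalar patterns that follow: VEX encodings are
+// non-destructive, so the first source operand only supplies the
+// untouched upper elements of the destination. That is why a plain f32
+// sqrt can be selected as
+//   (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)
+// where the upper bits of the result are don't-care.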
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, + SSE_SQRTS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, + SSE_SQRTS>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>, + sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, + int_x86_avx_rsqrt_ps_256, SSE_SQRTP>; +defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, + sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, + int_x86_avx_rcp_ps_256, SSE_RCPP>; + +def : Pat<(f32 (fsqrt FR32:$src)), + (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; +def : Pat<(f64 (fsqrt FR64:$src)), + (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; +def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + +def : Pat<(f32 (X86frsqrt FR32:$src)), + (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + +def : Pat<(f32 (X86frcp FR32:$src)), + (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_sqrt_ss VR128:$src), + (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128:$src, FR32)), + VR128)>; + def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), + (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_sse2_sqrt_sd VR128:$src), + (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128:$src, FR64)), + VR128)>; + def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), + (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; + + def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), + (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128:$src, FR32)), + VR128)>; + def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src), + (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_sse_rcp_ss VR128:$src), + (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128:$src, FR32)), + VR128)>; + def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src), + (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; +} + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +let Predicates = [UseSSE1] in { + def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), + (RSQRTSSr_Int VR128:$src, VR128:$src)>; + def : Pat<(int_x86_sse_rcp_ss VR128:$src), + (RCPSSr_Int VR128:$src, VR128:$src)>; +} + +// There is no f64 version of the reciprocal approximation instructions. 
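+// For reference, the "refinement" mentioned above is usually one
+// Newton-Raphson step on the ~12-bit rcpps/rsqrtps estimates (a sketch
+// of the idea, not code in this file):
+//   x0 = rcpps(a);   x1 = x0 * (2.0f - a*x0);         // refines 1/a
+//   r0 = rsqrtps(a); r1 = r0 * (1.5f - 0.5f*a*r0*r0); // refines 1/sqrt(a)
+// Each step roughly doubles the number of correct bits.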
+ +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Non-temporal stores +//===----------------------------------------------------------------------===// + +let AddedComplexity = 400 in { // Prefer non-temporal versions +let SchedRW = [WriteStore] in { +def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; +def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; + +let ExeDomain = SSEPackedInt in +def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; + +def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +let ExeDomain = SSEPackedInt in +def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4i64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; + +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; + +let ExeDomain = SSEPackedInt in +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; + +// There is no AVX form for instructions below this point +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti{l}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)], + IIC_SSE_MOVNT>, + TB, Requires<[HasSSE2]>; +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti{q}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)], + IIC_SSE_MOVNT>, + TB, Requires<[HasSSE2]>; +} // SchedRW = [WriteStore] + +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; + +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>; +} // AddedComplexity + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Prefetch and memory fence +//===----------------------------------------------------------------------===// + +// Prefetch intrinsic. 
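+// The llvm.prefetch operands are (addr, rw, locality, cache type); the
+// patterns below key on the locality hint: 3 -> prefetcht0,
+// 2 -> prefetcht1, 1 -> prefetcht2, 0 -> prefetchnta. E.g. Clang's
+//   __builtin_prefetch(p, /*rw=*/0, /*locality=*/3);  // -> prefetcht0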
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { +def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))], + IIC_SSE_PREFETCH>, TB; +} + +// FIXME: How should these memory instructions be modeled? +let SchedRW = [WriteLoad] in { +// Flush cache +def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], + IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>; + +// Pause. This "instruction" is encoded as "rep; nop", so even though it +// was introduced with SSE2, it's backward compatible. +def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP; + +// Load, store, and memory fence +def SFENCE : I<0xAE, MRM_F8, (outs), (ins), + "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, + TB, Requires<[HasSSE1]>; +def LFENCE : I<0xAE, MRM_E8, (outs), (ins), + "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, + TB, Requires<[HasSSE2]>; +def MFENCE : I<0xAE, MRM_F0, (outs), (ins), + "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>, + TB, Requires<[HasSSE2]>; +} // SchedRW + +def : Pat<(X86SFence), (SFENCE)>; +def : Pat<(X86LFence), (LFENCE)>; +def : Pat<(X86MFence), (MFENCE)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Load/Store MXCSR register +//===----------------------------------------------------------------------===// + +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; + +def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>; +def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, Sched<[WriteStore]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Aligned/Unaligned Packed Integer Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +let neverHasSideEffects = 1, SchedRW = [WriteMove] in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX, VEX_L; +def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX; +def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX, VEX_L; +} + +// For Disassembler +let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW =
[WriteMove] in { +def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, + VEX; +def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; +def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, + VEX; +def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; +} + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + neverHasSideEffects = 1, SchedRW = [WriteLoad] in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX, VEX_L; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX; + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX, VEX_L; +} +} + +let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX, VEX_L; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX, VEX_L; +} +} + +let SchedRW = [WriteMove] in { +let neverHasSideEffects = 1 in +def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; + +def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; + +// For Disassembler +let isCodeGenOnly = 1, hasSideEffects = 0 in { +def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + +def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; +} +} // SchedRW + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + neverHasSideEffects = 1, SchedRW = [WriteLoad] in { +def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], + IIC_SSE_MOVA_P_RM>; +def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], + IIC_SSE_MOVU_P_RM>, + XS, Requires<[UseSSE2]>; +} + +let mayStore = 1, SchedRW = [WriteStore] in { +def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, 
$dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVA_P_MR>; +def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(store (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVU_P_MR>, + XS, Requires<[UseSSE2]>; +} + +} // ExeDomain = SSEPackedInt + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), + (VMOVDQUYmr addr:$dst, VR256:$src)>; +} +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (MOVDQUmr addr:$dst, VR128:$src)>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Arithmetic Instructions +//===---------------------------------------------------------------------===// + +let Sched = WriteVecIMul in +def SSE_PMADD : OpndItins< + IIC_SSE_PMADD, IIC_SSE_PMADD +>; + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, + bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + Intrinsic IntId256, OpndItins itins, + bit IsCommutable = 0> { +let Predicates = [HasAVX] in + defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, + VR128, memopv2i64, i128mem, itins, + IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64, + i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2] in + defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, + VR256, memopv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + +multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, RegisterClass RC, + ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, + ShiftOpndItins itins, + bit Is2Addr = 1> { + // src2 is always 128-bit + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], + itins.rr>, Sched<[WriteVecShift]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, + (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>, + 
Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), + (ins RC:$src1, i32i8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>, + Sched<[WriteVecShift]>; +} + +/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types +multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; +defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1>; +defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 1>; +defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1>; +defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0>; +defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0>; +defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0>; +defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 0>; +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0>; +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0>; +defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; + +// Intrinsic forms +defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, + int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>; +defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, + int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>; +defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b, + int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>; +defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w, + int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>; +defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b, + int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>; +defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, + int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; +defm 
PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, + int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>; +defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, + int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>; +defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, + int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; +defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, + int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; +defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, + int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; +defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, + int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>; + +let Predicates = [HasAVX] in +defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, + VR256, memopv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { + // 128-bit logical shifts. + def VPSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>, + VEX_4V; + def VPSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>, + VEX_4V; + // PSRADQri doesn't exist in SSE[1-3]. 
+} +} // Predicates = [HasAVX] + +let Predicates = [HasAVX2] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR256, v8i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR256, v4i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR256, v8i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR256, v4i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR256, v8i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { + // 256-bit logical shifts. + def VPSLLDQYri : PDIi8<0x73, MRM7r, + (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>, + VEX_4V, VEX_L; + def VPSRLDQYri : PDIi8<0x73, MRM3r, + (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>, + VEX_4V, VEX_L; + // PSRADQYri doesn't exist in SSE[1-3]. +} +} // Predicates = [HasAVX2] + +let Constraints = "$src1 = $dst" in { +defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P>; +defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P>; +defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P>; +defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P>; +defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P>; +defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P>; + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { + // 128-bit logical shifts. 
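+  // These shift the full 128-bit lane by a *byte* count: the *_dq_bs
+  // intrinsics already take bytes, while the plain *_dq intrinsics in
+  // the patterns below take a bit count and go through BYTE_imm
+  // (bits -> bytes), e.g. a 32-bit shift becomes pslldq $4.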
+ def PSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "pslldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "psrldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>; + // PSRADQri doesn't exist in SSE[1-3]. +} +} // Constraints = "$src1 = $dst" + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), + (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; + def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), + (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; + def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), + (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; + + // Shift up / down and insert zeros. + def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), + (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; + def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), + (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2), + (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; + def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2), + (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), + (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; + def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), + (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; + def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), + (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; + + // Shift up / down and insert zeros. + def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), + (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; + def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), + (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Comparison Instructions +//===---------------------------------------------------------------------===// + +defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; +defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1>; +defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0>; +defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0>; +defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Pack Instructions +//===---------------------------------------------------------------------===// + +defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, + int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>; +defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, + int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>; +defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, + int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, + SDNode OpNode> { +let Predicates = [HasAVX] in { + def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>; + def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, + Sched<[WriteShuffleLd]>; +} + +let Predicates = [HasAVX2] in { + def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, i8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>; + def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, i8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L, + Sched<[WriteShuffleLd]>; +} + +let Predicates = [UseSSE2] in { + def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>, Sched<[WriteShuffle]>; + def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF>, + Sched<[WriteShuffleLd]>; +} +} +} // ExeDomain = SSEPackedInt + +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Unpack Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], + IIC_SSE_UNPCK>, Sched<[WriteShuffle]>; + 
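+  // Memory form: the second source operand is a 128-bit load, bitcast to the
+  // vector element type via bc_frag before the unpack node is matched.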
def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_frag (memopv2i64 + addr:$src2))))], + IIC_SSE_UNPCK>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, + bc_v16i8, 0>, VEX_4V; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, + bc_v8i16, 0>, VEX_4V; + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, 0>, VEX_4V; + + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, + bc_v16i8, 0>, VEX_4V; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, + bc_v8i16, 0>, VEX_4V; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, + bc_v4i32, 0>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, + bc_v2i64, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, + bc_v32i8>, VEX_4V, VEX_L; + defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, + bc_v16i16>, VEX_4V, VEX_L; + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; + + defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, + bc_v32i8>, VEX_4V, VEX_L; + defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, + bc_v16i16>, VEX_4V, VEX_L; + defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, + bc_v4i64>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, + bc_v16i8>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, + bc_v8i16>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, + bc_v4i32>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, + bc_v2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, + bc_v16i8>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, + bc_v8i16>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, + bc_v4i32>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, + bc_v2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Extract and Insert 
+//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pinsrw<bit Is2Addr = 1> { + def rri : Ii8<0xC4, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, + GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>, + Sched<[WriteShuffle]>; + def rmi : Ii8<0xC4, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, + i16mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))], IIC_SSE_PINSRW>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +// Extract +let Predicates = [HasAVX] in +def VPEXTRWri : Ii8<0xC5, MRMSrcReg, + (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, TB, OpSize, VEX, + Sched<[WriteShuffle]>; +def PEXTRWri : PDIi8<0xC5, MRMSrcReg, + (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))], IIC_SSE_PEXTRW>, + Sched<[WriteShuffleLd, ReadAfterLd]>; + +// Insert +let Predicates = [HasAVX] in { + defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V; + def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>; +} + +let Constraints = "$src1 = $dst" in + defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Mask Creation +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { + +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>, VEX; +def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX; + +let Predicates = [HasAVX2] in { +def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L; +def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; +} + +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Conditional Store +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { + +let Uses = [EDI] in +def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + 
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>, VEX; +let Uses = [RDI] in +def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>, VEX; + +let Uses = [EDI] in +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>; +let Uses = [RDI] in +def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Move Doubleword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + VEX, Sched<[WriteMove]>; +def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; +def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; +def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + +def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + Sched<[WriteMove]>; +def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; +def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Single Scalar +// +def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + +def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; +def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + 
IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + +def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int to Packed Double Int +// +def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, + Sched<[WriteMove]>; +def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; +def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + Sched<[WriteMove]>; +def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int first element to Doubleword Int +// +let SchedRW = [WriteMove] in { +def VMOVPQIto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, + VEX; + +def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>; +} //SchedRW + +//===---------------------------------------------------------------------===// +// Bitcast FR64 <-> GR64 +// +let Predicates = [HasAVX] in +def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + VEX, Sched<[WriteLoad]>; +def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; +def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + +def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; +def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + +//===---------------------------------------------------------------------===// +// Move Scalar Single to Double Int +// +def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + 
[(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; +def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; +def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; +def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + +//===---------------------------------------------------------------------===// +// Patterns and instructions to describe movd/movq to XMM register zero-extends +// +let SchedRW = [WriteMove] in { +let AddedComplexity = 15 in { +def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector GR32:$src)))))], + IIC_SSE_MOVDQ>, VEX; +def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only + [(set VR128:$dst, (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector GR64:$src)))))], + IIC_SSE_MOVDQ>, + VEX, VEX_W; +} +let AddedComplexity = 15 in { +def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector GR32:$src)))))], + IIC_SSE_MOVDQ>; +def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only + [(set VR128:$dst, (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector GR64:$src)))))], + IIC_SSE_MOVDQ>; +} +} // SchedRW + +let AddedComplexity = 20, SchedRW = [WriteLoad] in { +def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86vzmovl (v4i32 (scalar_to_vector + (loadi32 addr:$src))))))], + IIC_SSE_MOVDQ>, VEX; +def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86vzmovl (v4i32 (scalar_to_vector + (loadi32 addr:$src))))))], + IIC_SSE_MOVDQ>; +} // AddedComplexity, SchedRW + +let Predicates = [HasAVX] in { + // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVZDI2PDIrm addr:$src)>; + } + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. 
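+  // (SUBREG_TO_REG is safe in the patterns below precisely because, as noted
+  //  above, the VEX-encoded 128-bit moves already zero bits 255:128.)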
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; +} + +let Predicates = [UseSSE2], AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (MOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVZDI2PDIrm addr:$src)>; +} + +// These are the correct encodings of the instructions so that we know how to +// read correct assembly, even though we continue to emit the wrong ones for +// compatibility with Darwin's buggy assembler. +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOV64toSDrr FR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVSDto64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Quadword Int to Packed Quadword Int +// + +let SchedRW = [WriteLoad] in { +def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + VEX, Requires<[HasAVX]>; +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))], + IIC_SSE_MOVDQ>, XS, + Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix +} // SchedRW + +//===---------------------------------------------------------------------===// +// Move Packed Quadword Int to Quadword Int +// +let SchedRW = [WriteStore] in { +def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, VEX; +def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>; +} // SchedRW + +//===---------------------------------------------------------------------===// +// Store / copy lower 64-bits of a XMM register. 
+//
+def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
+                       Sched<[WriteStore]>;
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
+                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+let AddedComplexity = 20 in
+def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                  (loadi64 addr:$src))))))],
+                      IIC_SSE_MOVDQ>,
+                      XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
+
+let AddedComplexity = 20 in
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))],
+                     IIC_SSE_MOVDQ>,
+                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+
+let Predicates = [HasAVX], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (MOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (MOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (alignedX86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Moving from XMM to XMM and clearing the upper 64 bits. Note: there is a bug
+// in the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
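+// (The X86vzmovl patterns below rely on this: movq is a safe way to zero
+// bits 127:64 of an XMM register.)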
+// +let SchedRW = [WriteVecLogic] in { +let AddedComplexity = 15 in +def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, + XS, VEX, Requires<[HasAVX]>; +let AddedComplexity = 15 in +def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, + XS, Requires<[UseSSE2]>; +} // SchedRW + +let SchedRW = [WriteVecLogicLd] in { +let AddedComplexity = 20 in +def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, + XS, VEX, Requires<[HasAVX]>; +let AddedComplexity = 20 in { +def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, + XS, Requires<[UseSSE2]>; +} +} // SchedRW + +let AddedComplexity = 20 in { + let Predicates = [HasAVX] in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZPQILo2PQIrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (VMOVZPQILo2PQIrr VR128:$src)>; + } + let Predicates = [UseSSE2] in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (MOVZPQILo2PQIrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (MOVZPQILo2PQIrr VR128:$src)>; + } +} + +// Instructions to match in the assembler +let SchedRW = [WriteMove] in { +def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVDQ>, VEX, VEX_W; +def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVDQ>, VEX, VEX_W; +// Recognize "movd" with GR64 destination, but encode as a "movq" +def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVDQ>, VEX, VEX_W; +} // SchedRW + +// Instructions for the disassembler +// xr = XMM register +// xm = mem64 + +let SchedRW = [WriteMove] in { +let Predicates = [HasAVX] in +def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; +def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS; +} // SchedRW + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, + ValueType vt, RegisterClass RC, PatFrag mem_frag, + X86MemOperand x86memop> { +def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (OpNode RC:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; +def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>; +} + +let Predicates = [HasAVX] in { + defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v4f32, VR128, memopv4f32, f128mem>, VEX; + defm 
VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr> {
let neverHasSideEffects = 1 in
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                (v2f64 (X86Movddup
                        (scalar_to_vector (loadf64 addr:$src)))))],
              IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
}

// FIXME: Merge with the above class when there are patterns for the ymm
// version.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
              Sched<[WriteShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (v4f64 (X86Movddup
                        (scalar_to_vector (loadf64 addr:$src)))))]>,
              Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;

let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64
(v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (memopv4f64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (memopv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVDDUPrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Move Unaligned Integer +//===---------------------------------------------------------------------===// + +let SchedRW = [WriteLoad] in { +let Predicates = [HasAVX] in { + def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, + VEX, VEX_L; +} +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "lddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))], + IIC_SSE_LDDQU>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Arithmetic +//===---------------------------------------------------------------------===// + +multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { + def rr : I<0xD0, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm : I<0xD0, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L; + } +} +let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { + let ExeDomain = SSEPackedSingle in + defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, + f128mem, SSE_ALU_F32P>, TB, XD; + let ExeDomain = SSEPackedDouble in + defm ADDSUBPD : 
sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, + f128mem, SSE_ALU_F64P>, TB, OpSize; +} + +//===---------------------------------------------------------------------===// +// SSE3 Instructions +//===---------------------------------------------------------------------===// + +// Horizontal ops +multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; + + def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; +} +multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; + + def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, 0>, VEX_4V, VEX_L; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, 0>, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, 0>, VEX_4V, VEX_L; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, 0>, VEX_4V, VEX_L; + } +} + +let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; + } + let ExeDomain = SSEPackedDouble in { + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + } +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Absolute Instructions +//===---------------------------------------------------------------------===// + + +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
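+/// For example (illustrative): pabsb computes dst[i] = abs(src[i]) for each
+/// of the 16 signed bytes; the rm form folds a 128-bit load of the source.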
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128> { + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>, + OpSize, Sched<[WriteVecALU]>; + + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>, + OpSize, Sched<[WriteVecALULd]>; +} + +/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId256 VR256:$src))]>, + OpSize, Sched<[WriteVecALU]>; + + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (IntId256 + (bitconvert (memopv4i64 addr:$src))))]>, OpSize, + Sched<[WriteVecALULd]>; +} + +// Helper fragments to match sext vXi1 to vXiY. +def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), + VR128:$src))>; +def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>; +def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>; +def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), + VR256:$src))>; +def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>; +def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>; + +let Predicates = [HasAVX] in { + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", + int_x86_ssse3_pabs_b_128>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", + int_x86_ssse3_pabs_w_128>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", + int_x86_ssse3_pabs_d_128>, VEX; + + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (VPABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (VPABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (VPABSDrr128 VR128:$src)>; +} + +let Predicates = [HasAVX2] in { + defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", + int_x86_avx2_pabs_b>, VEX, VEX_L; + defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", + int_x86_avx2_pabs_w>, VEX, VEX_L; + defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", + int_x86_avx2_pabs_d>, VEX, VEX_L; + + def : Pat<(xor + (bc_v4i64 (v32i1sextv32i8)), + (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))), + (VPABSBrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v16i1sextv16i16)), + (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))), + (VPABSWrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v8i1sextv8i32)), + (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))), + (VPABSDrr256 VR256:$src)>; +} + +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", + int_x86_ssse3_pabs_b_128>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", + int_x86_ssse3_pabs_w_128>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", + int_x86_ssse3_pabs_d_128>; + +let Predicates = [HasSSSE3] in { + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (PABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 
(v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (PABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (PABSDrr128 VR128:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Binary Operator Instructions +//===---------------------------------------------------------------------===// + +let Sched = WriteVecALU in { +def SSE_PHADDSUBD : OpndItins< + IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM +>; +def SSE_PHADDSUBSW : OpndItins< + IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM +>; +def SSE_PHADDSUBW : OpndItins< + IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM +>; +} +let Sched = WriteShuffle in +def SSE_PSHUFB : OpndItins< + IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM +>; +let Sched = WriteVecALU in +def SSE_PSIGN : OpndItins< + IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM +>; +let Sched = WriteVecIMul in +def SSE_PMULHRSW : OpndItins< + IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW +>; + +/// SS3I_binop_rm - Simple SSSE3 bin op +multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + OpSize, Sched<[itins.Sched]>; + def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
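+/// These lower directly to the corresponding intrinsics; e.g. phaddsw
+/// pairwise-adds adjacent i16 elements of each source with signed saturation.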
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, OpndItins itins, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize, Sched<[itins.Sched]>; + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + let isCommutable = 1 in + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, + OpSize; + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; +} + +let ImmT = NoImm, Predicates = [HasAVX] in { +let isCommutable = 0 in { + defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128, + memopv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128, + memopv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128, + memopv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128, + memopv2i64, i128mem, + SSE_PSHUFB, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", + int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW, 0>, VEX_4V; + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, 0>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW, 0>, VEX_4V; +} + +let ImmT = NoImm, Predicates = [HasAVX2] in { +let isCommutable = 0 in { + defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, 
VEX_4V, VEX_L;
  defm VPSIGNBY  : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                 memopv4i64, i256mem,
                                 SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY  : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                 memopv4i64, i256mem,
                                 SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY  : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                 memopv4i64, i256mem,
                                 SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY  : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                 memopv4i64, i256mem,
                                 SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                       int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                       int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
}
defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                     int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
}

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW  : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                               memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD  : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                               memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW  : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                               memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD  : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                               memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB  : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                               memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW  : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                               memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND  : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                               memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB  : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                               memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SSE_PHADDSUBSW>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SSE_PHADDSUBSW>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
}
defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                  int_x86_ssse3_pmul_hr_sw_128,
                                  SSE_PMULHRSW>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}

multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, OpSize, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}

let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;

let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                   TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
      Requires<[In32BitMode]>;
def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===// + +multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, + OpSize; +} + +multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId> { + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + + def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize; +} + +let Predicates = [HasAVX] in { +defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, + VEX; +defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, + VEX; +defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>, + VEX; +defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>, + VEX; +defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>, + VEX; +defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>, + VEX; +} + +let Predicates = [HasAVX2] in { +defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", + int_x86_avx2_pmovsxbw>, VEX, VEX_L; +defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", + int_x86_avx2_pmovsxwd>, VEX, VEX_L; +defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", + int_x86_avx2_pmovsxdq>, VEX, VEX_L; +defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", + int_x86_avx2_pmovzxbw>, VEX, VEX_L; +defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", + int_x86_avx2_pmovzxwd>, VEX, VEX_L; +defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", + int_x86_avx2_pmovzxdq>, VEX, VEX_L; +} + +defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; +defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>; +defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>; +defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>; +defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>; +defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>; + +let Predicates = [HasAVX] in { + // Common patterns involving scalar load. 
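+  // (Editor's note: the extending moves read only the low half of the 128-bit
+  //  source, so a 64-bit scalar load that zero-fills the rest, as matched by
+  //  vzmovl_v2i64/vzload_v2i64 below, can be folded into the rm form.)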
+ def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), + (VPMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), + (VPMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), + (VPMOVSXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), + (VPMOVSXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), + (VPMOVSXDQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), + (VPMOVZXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), + (VPMOVZXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), + (VPMOVZXDQrm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + // Common patterns involving scalar load. 
+ def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), + (PMOVSXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), + (PMOVSXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), + (PMOVSXDQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), + (PMOVZXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), + (PMOVZXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), + (PMOVZXDQrm addr:$src)>; +} + +let Predicates = [HasAVX2] in { + let AddedComplexity = 15 in { + def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))), + (VPMOVZXDQYrr VR128:$src)>; + def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))), + (VPMOVZXWDYrr VR128:$src)>; + } + + def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; + def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; +} + + +multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + OpSize; +} + +multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId> { + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + + def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, + OpSize; +} + +let Predicates = [HasAVX] in { +defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, 
"vpmovsxbd", int_x86_sse41_pmovsxbd>, + VEX; +defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, + VEX; +defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>, + VEX; +defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>, + VEX; +} + +let Predicates = [HasAVX2] in { +defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", + int_x86_avx2_pmovsxbd>, VEX, VEX_L; +defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", + int_x86_avx2_pmovsxwq>, VEX, VEX_L; +defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", + int_x86_avx2_pmovzxbd>, VEX, VEX_L; +defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", + int_x86_avx2_pmovzxwq>, VEX, VEX_L; +} + +defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; +defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; +defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>; +defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>; + +let Predicates = [HasAVX] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), + (VPMOVSXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), + (VPMOVSXWQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), + (VPMOVZXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), + (VPMOVZXWQrm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), + (PMOVSXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), + (PMOVSXWQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), + (PMOVZXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), + (PMOVZXWQrm addr:$src)>; +} + +multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + + // Expecting a i16 load any extended to i32 value. + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId (bitconvert + (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, + OpSize; +} + +multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId> { + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + + // Expecting a i16 load any extended to i32 value. 
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId (bitconvert + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + OpSize; +} + +let Predicates = [HasAVX] in { +defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, + VEX; +defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, + VEX; +} +let Predicates = [HasAVX2] in { +defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", + int_x86_avx2_pmovsxbq>, VEX, VEX_L; +defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", + int_x86_avx2_pmovzxbq>, VEX, VEX_L; +} +defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; +defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; + +let Predicates = [HasAVX2] in { + def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; + def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>; + + def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>; + + def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; + + def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))), + (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))), + (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))), + (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))), + (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))), + (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))), + (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))), + (VPMOVSXWDYrm addr:$src)>; + def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))), + (VPMOVSXDQYrm addr:$src)>; + + def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXBDYrm addr:$src)>; + def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXBDYrm addr:$src)>; + + def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXWQYrm addr:$src)>; + def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXWQYrm addr:$src)>; + + def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXBQYrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXBQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXBQrm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>; + + def : Pat<(v4i32 
(X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; + + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXBQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVSXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVSXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXWQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (extloadi32i16 addr:$src))))))), + (PMOVSXBQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVSXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVSXDQrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVSXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVSXBWrm addr:$src)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>; + def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>; + + def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>; + + def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>; + + def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))), + (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))), + (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))), + (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))), + (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))), + (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))), + (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>; + + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(v8i16 
(X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), + (VPMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXWQrm addr:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), + (VPMOVZXDQrm addr:$src)>; + + def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXBWrm addr:$src)>; + + def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXWQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (extloadi32i16 addr:$src))))))), + (VPMOVSXBQrm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>; + + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXBWrm addr:$src)>; + def : 
Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32-bit reg or 8-bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+                   OpSize;
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                   (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+let Predicates = [HasAVX] in {
+  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+  def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
+                            (ins VR128:$src1, i32i8imm:$src2),
+                            "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                            []>, OpSize, VEX;
+}
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                   (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set GR32:$dst,
+                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                   (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
addr:$dst)]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set GR64:$dst,
+                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                   (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+                           addr:$dst)]>, OpSize, REX_W;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set GR32:$dst,
+                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+                   OpSize;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                   (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+                           addr:$dst)]>, OpSize;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [HasAVX] in {
+    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+    def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, OpSize, VEX;
+  }
+  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
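+// Illustration (not from the original source; assumes the standard
+// <smmintrin.h> helper): EXTRACTPS writes one 32-bit lane to memory, and
+// C code such as
+//   float f; _MM_EXTRACT_FLOAT(f, v, 2); // read lane 2 of v as a float
+// can leave the DAG typing that lane as f32 rather than i32, so both
+// typings should select the same (V)EXTRACTPSmr instruction, which is what
+// the two bitconvert patterns below arrange.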
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasAVX]>; +def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[UseSSE41]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Insert Instructions +//===----------------------------------------------------------------------===// + +multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), + imm:$src3)))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; +let Constraints = "$src1 = $dst" in + defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; + +// insertps has a few different modes, there's the first two here 
below, which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+      OpSize;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insrtps VR128:$src1,
+                    (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                    imm:$src3))]>, OpSize;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [HasAVX] in
+    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+  let Constraints = "$src1 = $dst" in
+    defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag32, PatFrag mem_frag64,
+                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+  // Vector intrinsic operation, reg
+  def PSr : SS4AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
+                    OpSize;
+
+  // Vector intrinsic operation, mem
+  def PSm : SS4AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                      (V4F32Int (mem_frag32 addr:$src1), imm:$src2))]>,
+                    OpSize;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : SS4AIi8<opcpd, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
+                    OpSize;
+
+  // Vector intrinsic operation, mem
+  def PDm : SS4AIi8<opcpd, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                      (V2F64Int (mem_frag64 addr:$src1), imm:$src2))]>,
+                    OpSize;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                             string OpcodeStr,
+                             Intrinsic F32Int,
+                             Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain in {
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(OpcodeStr,
+                   "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(OpcodeStr,
+                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      []>, OpSize;
+
+  // Intrinsic operation, reg.
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, mem. + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + OpSize; + + // Operation, reg. + let hasSideEffects = 0 in + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, mem. + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + OpSize; +} // ExeDomain = GenericDomain +} + +// FP round - roundss, roundps, roundsd, roundpd +let Predicates = [HasAVX] in { + // Intrinsic form + defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, + memopv8f32, memopv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX, VEX_L; + defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (VROUNDPSr 
VR128:$src, (i32 0x1))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x2))>; + def : Pat<(v4f32 (frint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x3))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x2))>; + def : Pat<(v2f64 (frint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x3))>; + + def : Pat<(v8f32 (ffloor VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x1))>; + def : Pat<(v8f32 (fnearbyint VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xC))>; + def : Pat<(v8f32 (fceil VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x2))>; + def : Pat<(v8f32 (frint VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x4))>; + def : Pat<(v8f32 (ftrunc VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x3))>; + + def : Pat<(v4f64 (ffloor VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x1))>; + def : Pat<(v4f64 (fnearbyint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xC))>; + def : Pat<(v4f64 (fceil VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x2))>; + def : Pat<(v4f64 (frint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x4))>; + def : Pat<(v4f64 (ftrunc VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x3))>; +} + +defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +let Constraints = "$src1 = $dst" in +defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", + int_x86_sse41_round_ss, int_x86_sse41_round_sd>; + +let Predicates = [UseSSE41] in { + def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x2))>; + def : Pat<(v4f32 (frint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x3))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x2))>; + def : Pat<(v2f64 (frint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x4))>; + def : 
Pat<(v2f64 (ftrunc VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x3))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// The PTEST instruction: we lower to this in X86ISelLowering, primarily from
+// the Intel intrinsic that corresponds to it.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+               "vptest\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+               OpSize, VEX;
+def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+               "vptest\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+               OpSize, VEX;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+                OpSize, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
+                OpSize, VEX, VEX_L;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+              OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+              OpSize;
+}
+
+// The bit test instructions below are AVX-only.
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
+  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+                 OpSize, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+                VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+                VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+                     OpSize, XS;
+  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+                      (implicit EFLAGS)]>, OpSize, XS;
+
+  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop GR32:$src)),
(implicit EFLAGS)]>, + XS; + def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS; + + def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, + XS; + def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + + + +// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. +multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, + Intrinsic IntId128> { + def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize; + def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (memopv2i64 addr:$src))))]>, OpSize; +} + +let Predicates = [HasAVX] in +defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", + int_x86_sse41_phminposuw>, VEX; +defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", + int_x86_sse41_phminposuw>; + +/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; +} + +/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator +multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + let isCommutable = 1 in + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize; + def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; +} + + +/// SS48I_binop_rm - Simple SSE41 binary operator. 
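+/// For illustration (not from the original source; assumes standard
+/// <smmintrin.h> semantics), an instantiation such as PMINSB corresponds to:
+///   __m128i r = _mm_min_epi8(a, b);                 // rr form
+///   __m128i s = _mm_min_epi8(a, _mm_load_si128(p)); // aligned load folds to rm
+/// i.e. the rr/rm pair below is the usual register/memory split.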
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))]>, OpSize; +} + +let Predicates = [HasAVX] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, + 0>, VEX_4V; + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, + 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", + int_x86_avx2_packusdw>, VEX_4V, VEX_L; + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", + int_x86_avx2_pmul_dq>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in + defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, + memopv2i64, i128mem>; + defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, + memopv2i64, i128mem>; + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128, + memopv2i64, i128mem>; + defm PMINUW : 
SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128, + memopv2i64, i128mem>; + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128, + memopv2i64, i128mem>; + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128, + memopv2i64, i128mem>; + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128, + memopv2i64, i128mem>; + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, + memopv2i64, i128mem>; + defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; +} + +let Predicates = [HasAVX] in { + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, 0>, VEX_4V; +} +let Predicates = [HasAVX2] in { + defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, + memopv2i64, i128mem>; + defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem>; +} + +/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + OpSize; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + OpSize; +} + +let Predicates = [HasAVX] in { + let isCommutable = 0 in { + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, memopv4f32, f128mem, 0>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, memopv8f32, + f256mem, 0>, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, memopv2f64, f128mem, 0>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256,VR256, memopv4f64, + f256mem, 0>, VEX_4V, VEX_L; + } + defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, + VR128, memopv2i64, i128mem, 0>, VEX_4V; + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, 0>, VEX_4V; + } + let ExeDomain = SSEPackedSingle in + defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, + VR128, memopv4f32, f128mem, 0>, VEX_4V; + let ExeDomain = SSEPackedDouble in + defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, + VR128, memopv2f64, f128mem, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in + defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", 
int_x86_avx_dp_ps_256,
+                                   VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+  let isCommutable = 0 in {
+  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
+                                       VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+                                       VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  }
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in {
+  let ExeDomain = SSEPackedSingle in
+  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
+                                     VR128, memopv4f32, f128mem>;
+  let ExeDomain = SSEPackedDouble in
+  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
+                                     VR128, memopv2f64, f128mem>;
+  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
+                                     VR128, memopv2i64, i128mem>;
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+                                     VR128, memopv2i64, i128mem>;
+  }
+  let ExeDomain = SSEPackedSingle in
+  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+                                  VR128, memopv4f32, f128mem>;
+  let ExeDomain = SSEPackedDouble in
+  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+                                  VR128, memopv2f64, f128mem>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+                                    RegisterClass RC, X86MemOperand x86memop,
+                                    PatFrag mem_frag, Intrinsic IntId> {
+  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2, RC:$src3),
+               !strconcat(OpcodeStr,
+                          "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+               NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+
+  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, x86memop:$src2, RC:$src3),
+               !strconcat(OpcodeStr,
+                          "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set RC:$dst,
+                 (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                        RC:$src3))],
+               NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+                                           memopv2f64, int_x86_sse41_blendvpd>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+                                           memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+                                           memopv4f32, int_x86_sse41_blendvps>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+                                           memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+                                           memopv2i64, int_x86_sse41_pblendvb>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+                                           memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2i64 (vselect
(v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), + (v8f32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), + (v4f64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + + def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2), + (imm:$mask))), + (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>; + def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2), + (imm:$mask))), + (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>; + + def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), + (imm:$mask))), + (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), + (imm:$mask))), + (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (imm:$mask))), + (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>; + def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), + (imm:$mask))), + (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; +} + +/// SS41I_ternary_int - SSE 4.1 ternary operator +let Uses = [XMM0], Constraints = "$src1 = $dst" in { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + X86MemOperand x86memop, Intrinsic IntId> { + def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, + OpSize; + + def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, + (IntId VR128:$src1, + (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize; + } +} + +let ExeDomain = SSEPackedDouble in +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, + int_x86_sse41_blendvpd>; +let ExeDomain = SSEPackedSingle in +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, + int_x86_sse41_blendvps>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, + int_x86_sse41_pblendvb>; + +// Aliases with the implicit xmm0 argument +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; +def : 
InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; + +let Predicates = [UseSSE41] in { + def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + + def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), + (imm:$mask))), + (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), + (imm:$mask))), + (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (imm:$mask))), + (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; + +} + +let Predicates = [HasAVX] in +def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + OpSize, VEX; +let Predicates = [HasAVX2] in +def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, + OpSize, VEX, VEX_L; +def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + OpSize; + +//===----------------------------------------------------------------------===// +// SSE4.2 - Compare Instructions +//===----------------------------------------------------------------------===// + +/// SS42I_binop_rm - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + OpSize; + def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + +let Predicates = [HasAVX2] in + defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + +let Constraints = "$src1 = $dst" in + defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - 
String/text Processing Instructions +//===----------------------------------------------------------------------===// + +// Packed Compare Implicit Length Strings, Return Mask +multiclass pseudo_pcmpistrm<string asm> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, + imm:$src3))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>; +} + +multiclass pcmpistrm_SS42AI<string asm> { + def rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; + let mayLoad = 1 in + def rm :SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; +} + +let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in { + let Predicates = [HasAVX] in + defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; +} + +// Packed Compare Explicit Length Strings, Return Mask +multiclass pseudo_pcmpestrm<string asm> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, + (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestrm<string asm> { + def rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; + let mayLoad = 1 in + def rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; +} + +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { + let Predicates = [HasAVX] in + defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; +} + +// Packed Compare Implicit Length Strings, Return Index +multiclass pseudo_pcmpistri<string asm> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + [(set GR32:$dst, EFLAGS, + (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>; + defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpistri<string asm> { + def rr : SS42AI<0x63, MRMSrcReg, 
(outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; + let mayLoad = 1 in + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; +} + +let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in { + let Predicates = [HasAVX] in + defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; + defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; +} + +// Packed Compare Explicit Length Strings, Return Index +multiclass pseudo_pcmpestri<string asm> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX, + imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>; + defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestri<string asm> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; + let mayLoad = 1 in + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; +} + +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { + let Predicates = [HasAVX] in + defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; + defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; +} + +//===----------------------------------------------------------------------===// +// SSE4.2 - CRC Instructions +//===----------------------------------------------------------------------===// + +// No CRC instructions have AVX equivalents + +// CRC intrinsic instructions. +// These come only in r and m forms; the only difference is the size +// of r and m.
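For readers cross-checking the CRC32 definitions that follow, the same operations are reachable from C++ through the SSE4.2 CRC32 intrinsics. A minimal sketch, assuming a 64-bit SSE4.2-capable target (e.g. compiled with -msse4.2); the helper name is ours, not part of this patch:

    #include <nmmintrin.h>  // SSE4.2 intrinsics: _mm_crc32_u8 / _mm_crc32_u64
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Accumulates a CRC-32C over a buffer: the 8-byte steps correspond to
    // crc32{q} (CRC32r64m64 below), the byte-sized tail to crc32{b}.
    uint32_t crc32c(const uint8_t *p, size_t n, uint32_t seed) {
      uint64_t crc = seed;
      for (; n >= 8; p += 8, n -= 8) {
        uint64_t v;
        std::memcpy(&v, p, sizeof v);
        crc = _mm_crc32_u64(crc, v);
      }
      while (n--)
        crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *p++);
      return static_cast<uint32_t>(crc);
    }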
+let Constraints = "$src1 = $dst" in { + def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i8mem:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_8 GR32:$src1, + (load addr:$src2)))]>; + def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR8:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; + def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i16mem:$src2), + "crc32{w} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_16 GR32:$src1, + (load addr:$src2)))]>, + OpSize; + def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR16:$src2), + "crc32{w} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, + OpSize; + def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "crc32{l} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_32 GR32:$src1, + (load addr:$src2)))]>; + def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "crc32{l} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; + def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i8mem:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_8 GR64:$src1, + (load addr:$src2)))]>, + REX_W; + def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR8:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, + REX_W; + def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "crc32{q} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_64 GR64:$src1, + (load addr:$src2)))]>, + REX_W; + def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "crc32{q} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, + REX_W; +} + +//===----------------------------------------------------------------------===// +// AES-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; + def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize; +} + +// Perform One Round of an AES Encryption/Decryption Flow +let Predicates = [HasAVX, HasAES] in { + defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc, 0>, VEX_4V; + defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast, 0>, VEX_4V; + defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec, 0>, 
VEX_4V; + defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc>; + defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast>; + defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec>; + defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast>; +} + +// Perform the AES InvMixColumn Transformation +let Predicates = [HasAVX, HasAES] in { + def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, + OpSize, VEX; + def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + OpSize, VEX; +} +def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, + OpSize; +def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + OpSize; + +// AES Round Key Generation Assist +let Predicates = [HasAVX, HasAES] in { + def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + OpSize, VEX; + def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + OpSize, VEX; +} +def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + OpSize; +def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + OpSize; + +//===----------------------------------------------------------------------===// +// PCLMUL Instructions +//===----------------------------------------------------------------------===// + +// AVX carry-less Multiplication instructions +def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; + +def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))]>; + +// Carry-less Multiplication instructions +let Constraints = "$src1 = $dst" in { +def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, 
$src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; + +def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))]>; +} // Constraints = "$src1 = $dst" + + +multiclass pclmul_alias<string asm, int immop> { + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), + (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>; + + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), + (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>; +} +defm : pclmul_alias<"hqhq", 0x11>; +defm : pclmul_alias<"hqlq", 0x01>; +defm : pclmul_alias<"lqhq", 0x10>; +defm : pclmul_alias<"lqlq", 0x00>; + +//===----------------------------------------------------------------------===// +// SSE4A Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE4A] in { + +let Constraints = "$src = $dst" in { +def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst), + (ins VR128:$src, i8imm:$len, i8imm:$idx), + "extrq\t{$idx, $len, $src|$src, $len, $idx}", + [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, + imm:$idx))]>, TB, OpSize; +def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "extrq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, + VR128:$mask))]>, TB, OpSize; + +def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx), + "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", + [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, + VR128:$src2, imm:$len, imm:$idx))]>, XD; +def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "insertq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, + VR128:$mask))]>, XD; +} + +def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), + "movntss\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS; + +def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movntsd\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD; +} + +//===----------------------------------------------------------------------===// +// AVX Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int addr:$src))]>, VEX; + +// AVX2 adds register forms +class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, + Intrinsic Int> : + AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, 
"\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int VR128:$src))]>, VEX; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, + int_x86_avx_vbroadcast_ss>; + def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, + int_x86_avx_vbroadcast_ss_256>, VEX_L; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, + int_x86_avx_vbroadcast_sd_256>, VEX_L; +def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, + int_x86_avx_vbroadcastf128_pd_256>, VEX_L; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, + int_x86_avx2_vbroadcast_ss_ps>; + def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, + int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, + int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L; + +let Predicates = [HasAVX2] in +def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, + int_x86_avx2_vbroadcasti128>, VEX_L; + +let Predicates = [HasAVX] in +def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), + (VBROADCASTF128 addr:$src)>; + + +//===----------------------------------------------------------------------===// +// VINSERTF128 - Insert packed floating-point values +// +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, i8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f128mem:$src2, i8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX] in { +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), 
+ (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTF128 - Extract packed floating-point values +// +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, i8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, VEX, VEX_L; +let mayStore = 1 in +def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR256:$src1, i8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, VEX, VEX_L; +} + +// AVX1 patterns +let Predicates = [HasAVX] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +} + 
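All of the integer patterns above deliberately reuse the floating-point instruction: AVX1 has no 256-bit integer insert/extract of its own, which is what the HasAVX1Only predicate captures (AVX2 adds VINSERTI128/VEXTRACTI128 later in this file). From C++ the same pair surfaces as the AVX lane intrinsics; a small sketch, assuming AVX and <immintrin.h> (helper name ours):

    #include <immintrin.h>

    // Swaps the 128-bit halves of a 256-bit register using exactly the
    // rr forms defined above.
    __m256 swap_halves(__m256 v) {
      __m128 lo = _mm256_extractf128_ps(v, 0);  // VEXTRACTF128rr, imm = 0
      __m128 hi = _mm256_extractf128_ps(v, 1);  // VEXTRACTF128rr, imm = 1
      __m256 r  = _mm256_castps128_ps256(hi);   // reinterpret only, no insn
      return _mm256_insertf128_ps(r, lo, 1);    // VINSERTF128rr, imm = 1
    }

(A compiler is free to fold this body into a single vperm2f128; the sketch only illustrates the operand order of the rr forms.)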
+//===----------------------------------------------------------------------===// +// VMASKMOV - Conditional SIMD Packed Loads and Stores +// +multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, + Intrinsic IntLd, Intrinsic IntLd256, + Intrinsic IntSt, Intrinsic IntSt256> { + def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, + VEX_4V; + def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; + def mr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; +} + +let ExeDomain = SSEPackedSingle in +defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", + int_x86_avx_maskload_ps, + int_x86_avx_maskload_ps_256, + int_x86_avx_maskstore_ps, + int_x86_avx_maskstore_ps_256>; +let ExeDomain = SSEPackedDouble in +defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", + int_x86_avx_maskload_pd, + int_x86_avx_maskload_pd_256, + int_x86_avx_maskstore_pd, + int_x86_avx_maskstore_pd_256>; + +//===----------------------------------------------------------------------===// +// VPERMIL - Permute Single and Double Floating-Point Values +// +multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop_f, + X86MemOperand x86memop_i, PatFrag i_frag, + Intrinsic IntVar, ValueType vt> { + def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V; + def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop_i:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, + (bitconvert (i_frag addr:$src2))))]>, VEX_4V; + + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX; + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + (ins x86memop_f:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX; +} + +let ExeDomain = SSEPackedSingle in { + defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>; + defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L; +} +let ExeDomain = SSEPackedDouble in { + defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>; + defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; +} + 
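The instantiations above give the vpermilps/vpermilpd family; in C++ the immediate (ri) form surfaces as _mm256_permute_ps/_mm256_permute_pd and the variable (rr) form as _mm256_permutevar_ps/_mm256_permutevar_pd. A one-line sketch, assuming AVX (helper name ours):

    #include <immintrin.h>

    // Reverses the four floats within each 128-bit lane: vpermilps with
    // the immediate control 0x1B (= 0b00011011, i.e. order 3,2,1,0) per lane.
    __m256 reverse_within_lanes(__m256 v) {
      return _mm256_permute_ps(v, 0x1B);        // VPERMILPSYri
    }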
+let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), + (VPERMILPSYri VR256:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), + (VPERMILPDYri VR256:$src1, imm:$imm)>; +def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)), + (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; + +def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), + (VPERMILPDri VR128:$src1, imm:$imm)>; +def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDmi addr:$src1, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// +let ExeDomain = SSEPackedSingle in { +def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V, VEX_L; +def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (memopv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, + (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// VZERO - Zero YMM registers +// +let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { + // Zero All YMM registers + def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", + [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>; + + // Zero Upper bits of YMM registers + def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>; +} + 
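A usage note on these two: on several microarchitectures, running legacy (non-VEX) SSE code while the upper halves of the YMM registers are dirty incurs an expensive state transition, and issuing vzeroupper at AVX-to-SSE boundaries is the standard mitigation. Compilers insert it automatically; the explicit call in this sketch (assuming AVX; helper name ours) is purely illustrative:

    #include <immintrin.h>

    // Horizontal sum of 8 floats; the trailing vzeroupper cleans the upper
    // YMM state before returning into possibly non-VEX (legacy SSE) code.
    float sum8(const float *p) {
      __m256 v = _mm256_loadu_ps(p);
      __m128 s = _mm_add_ps(_mm256_castps256_ps128(v),
                            _mm256_extractf128_ps(v, 1));
      s = _mm_hadd_ps(s, s);                    // SSE3 horizontal adds
      s = _mm_hadd_ps(s, s);
      float r = _mm_cvtss_f32(s);
      _mm256_zeroupper();                       // VZEROUPPER
      return r;
    }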
+//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { + def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (Int VR128:$src))]>, + T8, OpSize, VEX; + let neverHasSideEffects = 1, mayLoad = 1 in + def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; +} + +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { + def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins RC:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + TA, OpSize, VEX; + let neverHasSideEffects = 1, mayStore = 1 in + def mr : Ii8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TA, OpSize, VEX; +} + +let Predicates = [HasAVX, HasF16C] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; +} + +//===----------------------------------------------------------------------===// +// AVX2 Instructions +//===----------------------------------------------------------------------===// + +/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate +multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + let isCommutable = 1 in + def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u32u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + VEX_4V; + def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + VEX_4V; +} + +let isCommutable = 0 in { +defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, + VR128, memopv2i64, i128mem>; +defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, + VR256, memopv4i64, i256mem>, VEX_L; +} + +def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), + imm:$mask)), + (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>; +def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), + imm:$mask)), + (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>; + +//===----------------------------------------------------------------------===// +// VPBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + Intrinsic Int128, Intrinsic Int256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int128 VR128:$src))]>, VEX; + def rm : AVX28I<opc, MRMSrcMem, (outs 
VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, + VEX, VEX_L; +} + +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, + int_x86_avx2_pbroadcastb_128, + int_x86_avx2_pbroadcastb_256>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, + int_x86_avx2_pbroadcastw_128, + int_x86_avx2_pbroadcastw_256>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, + int_x86_avx2_pbroadcastd_128, + int_x86_avx2_pbroadcastd_256>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, + int_x86_avx2_pbroadcastq_128, + int_x86_avx2_pbroadcastq_256>; + +let Predicates = [HasAVX2] in { + def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBrm addr:$src)>; + def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDYrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQYrm addr:$src)>; + + def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBrr VR128:$src)>; + def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBYrr VR128:$src)>; + def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWrr VR128:$src)>; + def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWYrr VR128:$src)>; + def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDrr VR128:$src)>; + def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDYrr VR128:$src)>; + def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQYrr VR128:$src)>; + def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSrr VR128:$src)>; + def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSYrr VR128:$src)>; + def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), + (VBROADCASTSDYrr VR128:$src)>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. 
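+ // (Concretely: a load with additional users cannot be folded away, so + // instruction selection sees a broadcast of a scalar register instead of + // a broadcast of a load; the patterns below match that register form.)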
+ let AddedComplexity = 20 in { + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + } +} + +// AVX1 broadcast patterns +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + // 128bit broadcasts: + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; + } +} + +//===----------------------------------------------------------------------===// +// VPERM - Permute instructions +// + +multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT> { + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, + VEX_4V, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, + (bitconvert (mem_frag addr:$src2)))))]>, + VEX_4V, VEX_L; +} + +defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>; +let ExeDomain = SSEPackedSingle in +defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>; + +multiclass 
avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT> { + def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + VEX, VEX_L; + def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi (mem_frag addr:$src1), + (i8 imm:$src2))))]>, VEX, VEX_L; +} + +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W; +let ExeDomain = SSEPackedDouble in +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W; + +//===----------------------------------------------------------------------===// +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks +// +def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V, VEX_L; +def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V, VEX_L; + +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + + +//===----------------------------------------------------------------------===// +// VINSERTI128 - Insert packed integer values +// +let neverHasSideEffects = 1 in { +def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, i8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i128mem:$src2, i8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX2] in { +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, 
+ (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTI128 - Extract packed integer values +// +def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, i8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, + VEX, VEX_L; +let neverHasSideEffects = 1, mayStore = 1 in +def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR256:$src1, i8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX, VEX_L; + +let Predicates = [HasAVX2] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v2i64 (VEXTRACTI128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v4i32 (VEXTRACTI128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v8i16 (VEXTRACTI128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (v16i8 (VEXTRACTI128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +} + +//===----------------------------------------------------------------------===// +// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores +// +multiclass avx2_pmovmask<string OpcodeStr, + Intrinsic IntLd128, Intrinsic IntLd256, + Intrinsic IntSt128, Intrinsic IntSt256> { + def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set 
VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V; + def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; + def mr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; +} + +defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", + int_x86_avx2_maskload_d, + int_x86_avx2_maskload_d_256, + int_x86_avx2_maskstore_d, + int_x86_avx2_maskstore_d_256>; +defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", + int_x86_avx2_maskload_q, + int_x86_avx2_maskload_q_256, + int_x86_avx2_maskstore_q, + int_x86_avx2_maskstore_q_256>, VEX_W; + + +//===----------------------------------------------------------------------===// +// Variable Bit Shifts +// +multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, + VEX_4V; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, + (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>, + VEX_4V; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, + VEX_4V, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, + (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>, + VEX_4V, VEX_L; +} + +defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; +defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; +defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; +defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; +defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; + +//===----------------------------------------------------------------------===// +// VGATHER - GATHER Operations +multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, + X86MemOperand memop128, X86MemOperand memop256> { + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb), + (ins VR128:$src1, memop128:$src2, VR128:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3; + def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb), + (ins RC256:$src1, memop256:$src2, RC256:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3, VEX_L; +} + +let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; 
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td new file mode 100644 index 0000000..757dcd0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td @@ -0,0 +1,62 @@ +//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the AMD SVM instruction +// set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SVM instructions + +// 0F 01 D9 +def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB; + +// 0F 01 DC +def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB; + +// 0F 01 DD +def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; + +// 0F 01 DE +let Uses = [EAX] in +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB; + +// 0F 01 D8 +let Uses = [EAX] in +def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX] in +def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DA +let Uses = [EAX] in +def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX] in +def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DB +let Uses = [EAX] in +def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX] in +def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DF +let Uses = [EAX, ECX] in +def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX, ECX] in +def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td new file mode 100644 index 0000000..89c1a68 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -0,0 +1,978 @@ +//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the shift and rotate instructions. 
+// +//===----------------------------------------------------------------------===// + +// FIXME: Someone needs to smear multipattern goodness all over this file. + +let Defs = [EFLAGS] in { + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), + "shl{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; +def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize; +def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>; +def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; +} // Uses = [CL] + +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "shl{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; + +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "shl{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, + OpSize; +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "shl{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>; +def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "shl{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// NOTE: We don't include patterns for shifts of a register by one, because +// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). +let hasSideEffects = 0 in { +def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), + "shl{b}\t$dst", [], IIC_SR>; +def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t$dst", [], IIC_SR>, OpSize; +def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t$dst", [], IIC_SR>; +def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t$dst", [], IIC_SR>; +} // hasSideEffects = 0 +} // isConvertibleToThreeAddress = 1 +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern +// using CL? 
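+// (Most likely because TableGen's pattern inference only derives implicit +// defs such as EFLAGS from the pattern; an implicit use of a fixed physical +// register like CL still has to be listed by hand.)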
+let Uses = [CL] in { +def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize; +def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; +} +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), + "shl{b}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), + "shl{w}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, + OpSize; +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), + "shl{l}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), + "shl{q}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t$dst", + [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t$dst", + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, + OpSize; +def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t$dst", + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t$dst", + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), + "shr{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; +def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize; +def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>; +def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; +} + +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "shr{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "shr{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize; +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "shr{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), + "shr{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; + +// Shift right by 1 +def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1), 
+ "shr{b}\t$dst", + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>; +def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t$dst", + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize; +def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t$dst", + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>; +def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t$dst", + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize; +def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; +} +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), + "shr{b}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), + "shr{w}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, + OpSize; +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), + "shr{l}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), + "shr{q}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t$dst", + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t$dst", + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>,OpSize; +def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t$dst", + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t$dst", + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (sra GR8:$src1, CL))], + IIC_SR>; +def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (sra GR16:$src1, CL))], + IIC_SR>, OpSize; +def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (sra GR32:$src1, CL))], + IIC_SR>; +def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (sra GR64:$src1, CL))], + IIC_SR>; +} + +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "sar{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], + IIC_SR>; +def SAR16ri : Ii8<0xC1, MRM7r, (outs 
GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "sar{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, + OpSize; +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "sar{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; +def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "sar{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Shift by 1 +def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t$dst", + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))], + IIC_SR>; +def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t$dst", + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))], + IIC_SR>, OpSize; +def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t$dst", + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))], + IIC_SR>; +def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t$dst", + [(set GR64:$dst, (sra GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize; +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), + "sar{b}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), + "sar{w}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, + OpSize; +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), + "sar{l}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), + "sar{q}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t$dst", + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t$dst", + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, + OpSize; +def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t$dst", + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t$dst", + [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Rotate instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +def RCL8r1 : I<0xD0, MRM2r, 
(outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t$dst", [], IIC_SR>; +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + +def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t$dst", [], IIC_SR>, OpSize; +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; +let Uses = [CL] in +def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + +def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t$dst", [], IIC_SR>; +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + + +def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t$dst", [], IIC_SR>; +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + + +def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t$dst", [], IIC_SR>; +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + +def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t$dst", [], IIC_SR>, OpSize; +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; +let Uses = [CL] in +def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + +def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t$dst", [], IIC_SR>; +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + +def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t$dst", [], IIC_SR>; +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + +} // Constraints = "$src = $dst" + +let SchedRW = [WriteShiftLd, WriteRMW] in { +def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t$dst", [], IIC_SR>; +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t$dst", [], IIC_SR>, OpSize; +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; +def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t$dst", [], IIC_SR>; +def RCL32mi : Ii8<0xC1, MRM2m, (outs), 
(ins i32mem:$dst, i8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t$dst", [], IIC_SR>; +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + +def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t$dst", [], IIC_SR>; +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t$dst", [], IIC_SR>, OpSize; +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; +def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t$dst", [], IIC_SR>; +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t$dst", [], IIC_SR>; +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + +let Uses = [CL] in { +def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; +def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; +def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; +def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + +def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; +def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; +def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; +def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; +} +} // SchedRW +} // hasSideEffects = 0 + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +// FIXME: provide shorter instructions when imm8 == 1 +let Uses = [CL] in { +def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>; +def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize; +def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>; +def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; +} + +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "rol{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "rol{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, + OpSize; +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "rol{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "rol{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotl 
GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Rotate by 1 +def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t$dst", + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))], + IIC_SR>; +def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t$dst", + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))], + IIC_SR>, OpSize; +def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t$dst", + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))], + IIC_SR>; +def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t$dst", + [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize; +def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), + "rol{b}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), + "rol{w}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>, + OpSize; +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), + "rol{l}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), + "rol{q}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; + +// Rotate by 1 +def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t$dst", + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t$dst", + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, + OpSize; +def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t$dst", + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t$dst", + [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; +def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize; +def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>; +def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; +} + +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 
:$src1, i8imm:$src2), + "ror{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "ror{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, + OpSize; +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "ror{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "ror{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Rotate by 1 +def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t$dst", + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))], + IIC_SR>; +def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t$dst", + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))], + IIC_SR>, OpSize; +def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t$dst", + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))], + IIC_SR>; +def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t$dst", + [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize; +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), + "ror{b}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), + "ror{w}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, + OpSize; +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), + "ror{l}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), + "ror{q}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Rotate by 1 +def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t$dst", + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t$dst", + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, + OpSize; +def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t$dst", + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t$dst", + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + + +//===----------------------------------------------------------------------===// +// Double shift instructions (generalizations of 
rotate) +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { + +let Uses = [CL] in { +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, + TB, OpSize; +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, + TB, OpSize; +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB; +def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB; +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, + TB; +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, + TB; +} + +let isCommutable = 1 in { // These instructions commute to each other. +def SHLD16rri8 : Ii8<0xA4, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, i8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, + TB, OpSize; +def SHRD16rri8 : Ii8<0xAC, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, i8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, + TB, OpSize; +def SHLD32rri8 : Ii8<0xA4, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, i8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, + TB; +def SHRD32rri8 : Ii8<0xAC, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, i8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, + TB; +def SHLD64rri8 : RIi8<0xA4, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, i8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, + TB; +def SHRD64rri8 : RIi8<0xAC, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, i8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, + TB; +} +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; +def SHRD16mrCL : I<0xAD, MRMDestMem, 
(outs), (ins i16mem:$dst, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; + +def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)], IIC_SHD32_MEM_CL>, TB; +def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)], IIC_SHD32_MEM_CL>, TB; + +def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)], IIC_SHD64_MEM_CL>, TB; +def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)], IIC_SHD64_MEM_CL>, TB; +} + +def SHLD16mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, + TB, OpSize; +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, + TB, OpSize; + +def SHLD32mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, + TB; +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, + TB; + +def SHLD64mri8 : RIi8<0xA4, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, + TB; +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, + TB; +} // SchedRW + +} // Defs = [EFLAGS] + +def ROT32L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. + return getI8Imm(32 - N->getZExtValue()); +}]>; + +def ROT64L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 64-bit integer. 
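+  // E.g. an immediate rotl by 8 becomes a rotr by 64 - 8 = 56.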
+ return getI8Imm(64 - N->getZExtValue()); +}]>; + +multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in { + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShift]>; + let mayLoad = 1 in + def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, i8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShiftLd]>; +} +} + +multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in { + def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3, Sched<[WriteShift]>; + let mayLoad = 1 in + def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3, + Sched<[WriteShiftLd, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src1 + ReadAfterLd]>; +} +} + +let Predicates = [HasBMI2] in { + defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>; + defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W; + + // Prefer RORX which is non-destructive and doesn't update EFLAGS. + let AddedComplexity = 10 in { + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), + (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; + def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), + (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; + } + + def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)), + (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>; + def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)), + (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>; + + // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not + // immediate shift, i.e. the following code is considered better + // + // mov %edi, %esi + // shl $imm, %esi + // ... %edi, ... + // + // than + // + // movb $imm, %sil + // shlx %sil, %edi, %esi + // ... %edi, ...
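+  // (RORX aside, these BMI2 shifts have no immediate form, so an immediate +  // shift amount would first have to be materialized in a register, as the +  // movb above shows.)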
+ // + let AddedComplexity = 1 in { + def : Pat<(sra GR32:$src1, GR8:$src2), + (SARX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra GR64:$src1, GR8:$src2), + (SARX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl GR32:$src1, GR8:$src2), + (SHRX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl GR64:$src1, GR8:$src2), + (SHRX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl GR32:$src1, GR8:$src2), + (SHLX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl GR64:$src1, GR8:$src2), + (SHLX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } + + // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor + // + // mov (%ecx), %esi + // shl $imm, %esi + // + // over + // + // movb $imm, %al + // shlx %al, (%ecx), %esi + // + // As SARXrr/SHRXrr/SHLXrr are favored for variable shifts, the peephole + // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible. +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td new file mode 100644 index 0000000..bab3cdd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -0,0 +1,529 @@ +//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instructions that are generally used in +// privileged modes. These are not typically used by the compiler, but are +// supported for the assembler and disassembler. +// +//===----------------------------------------------------------------------===// + +let SchedRW = [WriteSystem] in { +let Defs = [RAX, RDX] in + def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>, + TB; + +let Defs = [RAX, RCX, RDX] in + def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; + +// CPU flow control instructions + +let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in { + def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; + def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; +} + +def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>; +def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB; + +// Interrupt and SysCall Instructions. +let Uses = [EFLAGS] in + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; +def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", + [(int_x86_int (i8 3))], IIC_INT3>; +} // SchedRW + +def : Pat<(debugtrap), + (INT3)>; + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>; + +let SchedRW = [WriteSystem] in { + +def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", + [(int_x86_int imm:$trap)], IIC_INT>; + + +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB; +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB, + Requires<[In64BitMode]>; + +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [], + IIC_SYS_ENTER_EXIT>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], + IIC_SYS_ENTER_EXIT>, TB; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB, + Requires<[In64BitMode]>; + +def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize; +def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>; +def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, + Requires<[In64BitMode]>; +} // SchedRW + + +//===----------------------------------------------------------------------===// +// Input/Output Instructions. +// +let SchedRW = [WriteSystem] in { +let Defs = [AL], Uses = [DX] in +def IN8rr : I<0xEC, RawFrm, (outs), (ins), + "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>; +let Defs = [AX], Uses = [DX] in +def IN16rr : I<0xED, RawFrm, (outs), (ins), + "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize; +let Defs = [EAX], Uses = [DX] in +def IN32rr : I<0xED, RawFrm, (outs), (ins), + "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>; + +let Defs = [AL] in +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), + "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>; +let Defs = [AX] in +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), + "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize; +let Defs = [EAX] in +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), + "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>; + +let Uses = [DX, AL] in +def OUT8rr : I<0xEE, RawFrm, (outs), (ins), + "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>; +let Uses = [DX, AX] in +def OUT16rr : I<0xEF, RawFrm, (outs), (ins), + "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize; +let Uses = [DX, EAX] in +def OUT32rr : I<0xEF, RawFrm, (outs), (ins), + "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>; + +let Uses = [AL] in +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), + "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>; +let Uses = [AX] in +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), + "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize; +let Uses = [EAX] in +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), + "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>; + +def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>; +def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize; +def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from debug registers + +let SchedRW = [WriteSystem] in { +def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; +def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; + +def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; +def MOV64dr : I<0x23, MRMSrcReg, 
(outs DEBUG_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from control registers + +let SchedRW = [WriteSystem] in { +def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; +def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; + +def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; +def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Segment override instruction prefixes + +def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; +def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; +def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; +def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; +def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; +def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; + + +//===----------------------------------------------------------------------===// +// Moves to and from segment registers. +// + +let SchedRW = [WriteMove] in { +def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize; +def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; +def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; + +def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize; +def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; +def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; + +def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize; +def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; +def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; + +def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize; +def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; +def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Segmentation support instructions. 
+ +let SchedRW = [WriteSystem] in { +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; + +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize; +def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize; + +// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; +def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; +// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo. +def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; + +def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize; +def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize; +def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; +def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; +def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; + +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", + [], IIC_INVLPG>, TB; + +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t$dst", [], IIC_STR>, TB, OpSize; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t$dst", [], IIC_STR>, TB; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t$dst", [], IIC_STR>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t$dst", [], IIC_STR>, TB; + +def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), + "ltr{w}\t$src", [], IIC_LTR>, TB; +def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), + "ltr{w}\t$src", [], IIC_LTR>, TB; + +def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), + "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; +def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), + "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; +def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), + "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; +def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), + "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; +def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), + "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; +def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), + "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; +def PUSHES16 : I<0x06, RawFrm, (outs), (ins), + "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; +def PUSHES32 : I<0x06, RawFrm, (outs), (ins), + "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + +def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), + "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB; +def PUSHFS32 : 
I<0xa0, RawFrm, (outs), (ins), + "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; +def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), + "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB; +def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), + "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + +def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), + "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB; +def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), + "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB; + +// No "pop cs" instruction. +def POPSS16 : I<0x17, RawFrm, (outs), (ins), + "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>, + OpSize, Requires<[In32BitMode]>; +def POPSS32 : I<0x17, RawFrm, (outs), (ins), + "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>, + Requires<[In32BitMode]>; + +def POPDS16 : I<0x1F, RawFrm, (outs), (ins), + "pop{w}\t{%ds|DS}", [], IIC_POP_SR>, + OpSize, Requires<[In32BitMode]>; +def POPDS32 : I<0x1F, RawFrm, (outs), (ins), + "pop{l}\t{%ds|DS}", [], IIC_POP_SR>, + Requires<[In32BitMode]>; + +def POPES16 : I<0x07, RawFrm, (outs), (ins), + "pop{w}\t{%es|ES}", [], IIC_POP_SR>, + OpSize, Requires<[In32BitMode]>; +def POPES32 : I<0x07, RawFrm, (outs), (ins), + "pop{l}\t{%es|ES}", [], IIC_POP_SR>, + Requires<[In32BitMode]>; + +def POPFS16 : I<0xa1, RawFrm, (outs), (ins), + "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB; +def POPFS32 : I<0xa1, RawFrm, (outs), (ins), + "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; +def POPFS64 : I<0xa1, RawFrm, (outs), (ins), + "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB; + +def POPGS16 : I<0xa9, RawFrm, (outs), (ins), + "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB; +def POPGS32 : I<0xa9, RawFrm, (outs), (ins), + "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; +def POPGS64 : I<0xa9, RawFrm, (outs), (ins), + "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB; + + +def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize; +def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>; + +def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; +def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; +def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize; +def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>; + +def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; +def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; +def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; +def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lgs{q}\t{$src, 
$dst|$dst, $src}", [], IIC_LXS>, TB; + + +def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), + "verr\t$seg", [], IIC_VERR>, TB; +def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), + "verr\t$seg", [], IIC_VERR>, TB; +def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), + "verw\t$seg", [], IIC_VERW_REG>, TB; +def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), + "verw\t$seg", [], IIC_VERW_MEM>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Descriptor-table support instructions + +let SchedRW = [WriteSystem] in { +def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>; +def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt\t$dst", [], IIC_SGDT>, TB; +def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>; +def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt\t$dst", [], IIC_SIDT>, TB; +def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), + "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize; +def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{w}\t$dst", [], IIC_SLDT>, TB; +def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), + "sldt{l}\t$dst", [], IIC_SLDT>, TB; + +// LLDT is not interpreted specially in 64-bit mode because there is no sign +// extension. +def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), + "sldt{q}\t$dst", [], IIC_SLDT>, TB; +def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{q}\t$dst", [], IIC_SLDT>, TB; + +def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>; +def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt\t$src", [], IIC_LGDT>, TB; +def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>; +def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt\t$src", [], IIC_LIDT>, TB; +def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), + "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; +def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), + "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Specialized register support +let SchedRW = [WriteSystem] in { +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; +def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; + +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), + "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB; +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), + "smsw{l}\t$dst", [], IIC_SMSW>, TB; +// no m form encodable; use SMSW16m +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), + "smsw{q}\t$dst", [], IIC_SMSW>, TB; + +// For memory operands, there is only a 16-bit form +def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins), + "smsw{w}\t$dst", [], IIC_SMSW>, TB; + +def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), + "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; +def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), + "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; + +def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Cache instructions +let SchedRW = [WriteSystem] in { +def INVD 
: I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// XSAVE instructions +let SchedRW = [WriteSystem] in { +let Defs = [RDX, RAX], Uses = [RCX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [RDX, RAX, RCX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; + +let Uses = [RDX, RAX] in { + def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), + "xsave\t$dst", []>, TB; + def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), + "xsave{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor\t$dst", []>, TB; + def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), + "xsaveopt\t$dst", []>, TB; + def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), + "xsaveopt{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI] in + def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7; + +def : InstAlias<"xstorerng", (XSTORE)>; + +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7; + def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7; + def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7; + def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7; + def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6; + def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6; + +//===----------------------------------------------------------------------===// +// FS/GS Base Instructions +let Predicates = [HasFSGSBase, In64BitMode] in { + def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), + "rdfsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS; + def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), + "rdfsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS; + def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins), + "rdgsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS; + def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins), + "rdgsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS; + def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src), + "wrfsbase{l}\t$src", + [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS; + def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src), + "wrfsbase{q}\t$src", + [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS; + def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src), + "wrgsbase{l}\t$src", + [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS; + def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src), + "wrgsbase{q}\t$src", + [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS; +} + +//===----------------------------------------------------------------------===// +// INVPCID Instruction +def INVPCID32 : I<0x82, 
MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In32BitMode]>; +def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// SMAP Instruction +let Defs = [EFLAGS], Uses = [EFLAGS] in { + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td new file mode 100644 index 0000000..363a190 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td @@ -0,0 +1,39 @@ +//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel TSX instruction +// set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TSX instructions + +def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +let usesCustomInserter = 1 in +def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), + "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, + Requires<[HasRTM]>; + +let isBranch = 1, isTerminator = 1, Defs = [EAX] in +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst), + "xbegin\t$dst", []>, Requires<[HasRTM]>; + +def XEND : I<0x01, MRM_D5, (outs), (ins), + "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; + +let Defs = [EFLAGS] in +def XTEST : I<0x01, MRM_D6, (outs), (ins), + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>; + +def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), + "xabort\t$imm", + [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td new file mode 100644 index 0000000..6d3548f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td @@ -0,0 +1,66 @@ +//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel VMX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VMX instructions + +// 66 0F 38 80 +def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In32BitMode]>; +def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In64BitMode]>; +// 66 0F 38 81 +def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In32BitMode]>; +def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In64BitMode]>; +// 0F 01 C1 +def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; +def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmclear\t$vmcs", []>, OpSize, TB; +// 0F 01 D4 +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; +// 0F 01 C2 +def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; +// 0F 01 C3 +def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; +def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmptrld\t$vmcs", []>, TB; +def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins), + "vmptrst\t$vmcs", []>, TB; +def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; +def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; +def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; +def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; +def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; +def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; +def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; +def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; +// 0F 01 C4 +def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; +def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), + "vmxon\t$vmxon", []>, XS; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td new file mode 100644 index 0000000..2aa08fa --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -0,0 +1,311 @@ +//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; +} + +// Scalar load 2 addr operand instructions +multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; +} + +multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; +} + +multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L; + def rmY : 
IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, + memopv8f32>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, + memopv4f64>; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, + VEX_4V, VEX_W; + def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>, + VEX_4VOp3; +} + +let isAsmParserOnly = 1 in { + defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; + defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; + defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; + defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; + defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; + defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; + defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; + defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; + defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; + defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; + defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + VR128:$src3))]>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm 
VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} + +// Instruction where second source can be memory, third must be imm8 +multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>, + VEX_4V; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + imm:$src3))]>, VEX_4V; +} + +let isAsmParserOnly = 1 in { + defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; + defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; + defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; + defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; + defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; + defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; +} + +// Instruction where either second or third source can be memory +multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, + VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, + (bitconvert (memopv2i64 addr:$src3))))]>, + VEX_4V, VEX_I8IMM, VEX_W, MemOp4; + def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + VR128:$src3))]>, + VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} + +multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + 
[(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, + VEX_4V, VEX_I8IMM, VEX_L; + def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, VR256:$src2, + (bitconvert (memopv4i64 addr:$src3))))]>, + VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; + def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)), + VR256:$src3))]>, + VEX_4V, VEX_I8IMM, VEX_L; +} + +let isAsmParserOnly = 1 in { + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +} + +multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, + Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { + def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; + def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4; + def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; + def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; + def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4, VEX_L; + def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>, + VEX_L; +} + +defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>; +defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>; + diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp new file mode 100644 index 0000000..44d8cce --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp @@ -0,0 +1,581 @@ +//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the X86 target. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "X86JITInfo.h" +#include "X86Relocations.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Valgrind.h" +#include <cstdlib> +#include <cstring> +using namespace llvm; + +// Determine the platform we're running on +#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64) +# define X86_64_JIT +#elif defined(__i386__) || defined(i386) || defined(_M_IX86) +# define X86_32_JIT +#endif + +void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + unsigned char *OldByte = (unsigned char *)Old; + *OldByte++ = 0xE9; // Emit JMP opcode. + unsigned *OldWord = (unsigned *)OldByte; + unsigned NewAddr = (intptr_t)New; + unsigned OldAddr = (intptr_t)OldWord; + *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code. + + // X86 doesn't need to invalidate the processor cache, so just invalidate + // Valgrind's cache directly. + sys::ValgrindDiscardTranslations(Old, 5); +} + + +/// JITCompilerFunction - This contains the address of the JIT function used to +/// compile a function lazily. +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +// Get the ASMPREFIX for the current host. This is often '_'. +#ifndef __USER_LABEL_PREFIX__ +#define __USER_LABEL_PREFIX__ +#endif +#define GETASMPREFIX2(X) #X +#define GETASMPREFIX(X) GETASMPREFIX2(X) +#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) + +// For ELF targets, use a .size and .type directive, to let tools +// know the extent of functions defined in assembler. +#if defined(__ELF__) +# define SIZE(sym) ".size " #sym ", . - " #sym "\n" +# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n" +#else +# define SIZE(sym) +# define TYPE_FUNCTION(sym) +#endif + +// Provide a convenient way for disabling usage of CFI directives. +// This is needed for old/broken assemblers (for example, gas on +// Darwin is pretty old and doesn't support these directives) +#if defined(__APPLE__) +# define CFI(x) +#else +// FIXME: Disable this until we really want to use it. Also, we will +// need to add some workarounds for compilers, which support +// only subset of these directives. +# define CFI(x) +#endif + +// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional +// callee saved registers, for the fastcc calling convention. +extern "C" { +#if defined(X86_64_JIT) +# ifndef _MSC_VER + // No need to save EAX/EDX for X86-64. 
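+  // The lazily-compiled callee's arguments are still live in the argument
+  // registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9 and %xmm0-%xmm7) when the
+  // stub fires, so the assembly below spills exactly those registers around
+  // the call into the JIT and restores them before re-running the call.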
+  void X86CompilationCallback(void); + asm( + ".text\n" + ".align 8\n" + ".globl " ASMPREFIX "X86CompilationCallback\n" + TYPE_FUNCTION(X86CompilationCallback) + ASMPREFIX "X86CompilationCallback:\n" + CFI(".cfi_startproc\n") + // Save RBP + "pushq %rbp\n" + CFI(".cfi_def_cfa_offset 16\n") + CFI(".cfi_offset %rbp, -16\n") + // Save RSP + "movq %rsp, %rbp\n" + CFI(".cfi_def_cfa_register %rbp\n") + // Save all int arg registers + "pushq %rdi\n" + CFI(".cfi_rel_offset %rdi, 0\n") + "pushq %rsi\n" + CFI(".cfi_rel_offset %rsi, 8\n") + "pushq %rdx\n" + CFI(".cfi_rel_offset %rdx, 16\n") + "pushq %rcx\n" + CFI(".cfi_rel_offset %rcx, 24\n") + "pushq %r8\n" + CFI(".cfi_rel_offset %r8, 32\n") + "pushq %r9\n" + CFI(".cfi_rel_offset %r9, 40\n") + // Align stack on 16-byte boundary. RSP might not be properly aligned + // (8 byte) if this is called from an indirect stub. + "andq $-16, %rsp\n" + // Save all XMM arg registers + "subq $128, %rsp\n" + "movaps %xmm0, (%rsp)\n" + "movaps %xmm1, 16(%rsp)\n" + "movaps %xmm2, 32(%rsp)\n" + "movaps %xmm3, 48(%rsp)\n" + "movaps %xmm4, 64(%rsp)\n" + "movaps %xmm5, 80(%rsp)\n" + "movaps %xmm6, 96(%rsp)\n" + "movaps %xmm7, 112(%rsp)\n" + // JIT callee +#ifdef _WIN64 + "subq $32, %rsp\n" + "movq %rbp, %rcx\n" // Pass prev frame and return address + "movq 8(%rbp), %rdx\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" + "addq $32, %rsp\n" +#else + "movq %rbp, %rdi\n" // Pass prev frame and return address + "movq 8(%rbp), %rsi\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" +#endif + // Restore all XMM arg registers + "movaps 112(%rsp), %xmm7\n" + "movaps 96(%rsp), %xmm6\n" + "movaps 80(%rsp), %xmm5\n" + "movaps 64(%rsp), %xmm4\n" + "movaps 48(%rsp), %xmm3\n" + "movaps 32(%rsp), %xmm2\n" + "movaps 16(%rsp), %xmm1\n" + "movaps (%rsp), %xmm0\n" + // Restore RSP + "movq %rbp, %rsp\n" + CFI(".cfi_def_cfa_register %rsp\n") + // Restore all int arg registers + "subq $48, %rsp\n" + CFI(".cfi_adjust_cfa_offset 48\n") + "popq %r9\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %r9\n") + "popq %r8\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %r8\n") + "popq %rcx\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rcx\n") + "popq %rdx\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rdx\n") + "popq %rsi\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rsi\n") + "popq %rdi\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rdi\n") + // Restore RBP + "popq %rbp\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rbp\n") + "ret\n" + CFI(".cfi_endproc\n") + SIZE(X86CompilationCallback) + ); +# else + // No inline assembler support on this platform. The routine is in external + // file. 
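+  // (Presumably the external file is the Win64 assembly shipped next to this
+  // source -- X86CompilationCallback_Win64.asm in-tree -- since 64-bit MSVC
+  // supports neither inline assembly nor naked functions.)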
+ void X86CompilationCallback(); + +# endif +#elif defined (X86_32_JIT) +# ifndef _MSC_VER + void X86CompilationCallback(void); + asm( + ".text\n" + ".align 8\n" + ".globl " ASMPREFIX "X86CompilationCallback\n" + TYPE_FUNCTION(X86CompilationCallback) + ASMPREFIX "X86CompilationCallback:\n" + CFI(".cfi_startproc\n") + "pushl %ebp\n" + CFI(".cfi_def_cfa_offset 8\n") + CFI(".cfi_offset %ebp, -8\n") + "movl %esp, %ebp\n" // Standard prologue + CFI(".cfi_def_cfa_register %ebp\n") + "pushl %eax\n" + CFI(".cfi_rel_offset %eax, 0\n") + "pushl %edx\n" // Save EAX/EDX/ECX + CFI(".cfi_rel_offset %edx, 4\n") + "pushl %ecx\n" + CFI(".cfi_rel_offset %ecx, 8\n") +# if defined(__APPLE__) + "andl $-16, %esp\n" // Align ESP on 16-byte boundary +# endif + "subl $16, %esp\n" + "movl 4(%ebp), %eax\n" // Pass prev frame and return address + "movl %eax, 4(%esp)\n" + "movl %ebp, (%esp)\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" + "movl %ebp, %esp\n" // Restore ESP + CFI(".cfi_def_cfa_register %esp\n") + "subl $12, %esp\n" + CFI(".cfi_adjust_cfa_offset 12\n") + "popl %ecx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ecx\n") + "popl %edx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %edx\n") + "popl %eax\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %eax\n") + "popl %ebp\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ebp\n") + "ret\n" + CFI(".cfi_endproc\n") + SIZE(X86CompilationCallback) + ); + + // Same as X86CompilationCallback but also saves XMM argument registers. + void X86CompilationCallback_SSE(void); + asm( + ".text\n" + ".align 8\n" + ".globl " ASMPREFIX "X86CompilationCallback_SSE\n" + TYPE_FUNCTION(X86CompilationCallback_SSE) + ASMPREFIX "X86CompilationCallback_SSE:\n" + CFI(".cfi_startproc\n") + "pushl %ebp\n" + CFI(".cfi_def_cfa_offset 8\n") + CFI(".cfi_offset %ebp, -8\n") + "movl %esp, %ebp\n" // Standard prologue + CFI(".cfi_def_cfa_register %ebp\n") + "pushl %eax\n" + CFI(".cfi_rel_offset %eax, 0\n") + "pushl %edx\n" // Save EAX/EDX/ECX + CFI(".cfi_rel_offset %edx, 4\n") + "pushl %ecx\n" + CFI(".cfi_rel_offset %ecx, 8\n") + "andl $-16, %esp\n" // Align ESP on 16-byte boundary + // Save all XMM arg registers + "subl $64, %esp\n" + // FIXME: provide frame move information for xmm registers. + // This can be tricky, because CFA register is ebp (unaligned) + // and we need to produce offsets relative to it. 
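+    // With %esp forced to a 16-byte boundary above, the 64 bytes just
+    // reserved hold %xmm0-%xmm3 at offsets 0/16/32/48; movaps traps on
+    // unaligned operands, so the alignment must precede these spills.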
+ "movaps %xmm0, (%esp)\n" + "movaps %xmm1, 16(%esp)\n" + "movaps %xmm2, 32(%esp)\n" + "movaps %xmm3, 48(%esp)\n" + "subl $16, %esp\n" + "movl 4(%ebp), %eax\n" // Pass prev frame and return address + "movl %eax, 4(%esp)\n" + "movl %ebp, (%esp)\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" + "addl $16, %esp\n" + "movaps 48(%esp), %xmm3\n" + CFI(".cfi_restore %xmm3\n") + "movaps 32(%esp), %xmm2\n" + CFI(".cfi_restore %xmm2\n") + "movaps 16(%esp), %xmm1\n" + CFI(".cfi_restore %xmm1\n") + "movaps (%esp), %xmm0\n" + CFI(".cfi_restore %xmm0\n") + "movl %ebp, %esp\n" // Restore ESP + CFI(".cfi_def_cfa_register esp\n") + "subl $12, %esp\n" + CFI(".cfi_adjust_cfa_offset 12\n") + "popl %ecx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ecx\n") + "popl %edx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %edx\n") + "popl %eax\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %eax\n") + "popl %ebp\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ebp\n") + "ret\n" + CFI(".cfi_endproc\n") + SIZE(X86CompilationCallback_SSE) + ); +# else + void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); + + _declspec(naked) void X86CompilationCallback(void) { + __asm { + push ebp + mov ebp, esp + push eax + push edx + push ecx + and esp, -16 + sub esp, 16 + mov eax, dword ptr [ebp+4] + mov dword ptr [esp+4], eax + mov dword ptr [esp], ebp + call LLVMX86CompilationCallback2 + mov esp, ebp + sub esp, 12 + pop ecx + pop edx + pop eax + pop ebp + ret + } + } + +# endif // _MSC_VER + +#else // Not an i386 host + void X86CompilationCallback() { + llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!"); + } +#endif +} + +/// This is the target-specific function invoked by the +/// function stub when we did not know the real target of a call. This function +/// must locate the start of the stub or call site and pass it into the JIT +/// compiler function. +extern "C" { +LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr, + intptr_t RetAddr) { + intptr_t *RetAddrLoc = &StackPtr[1]; + // We are reading raw stack data here. Tell MemorySanitizer that it is + // sufficiently initialized. + __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc)); + assert(*RetAddrLoc == RetAddr && + "Could not find return address on the stack!"); + + // It's a stub if there is an interrupt marker after the call. + bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE; + + // The call instruction should have pushed the return value onto the stack... +#if defined (X86_64_JIT) + RetAddr--; // Backtrack to the reference itself... +#else + RetAddr -= 4; // Backtrack to the reference itself... +#endif + +#if 0 + DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr + << " ESP=" << (void*)StackPtr + << ": Resolving call to function: " + << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n"); +#endif + + // Sanity check to make sure this really is a call instruction. +#if defined (X86_64_JIT) + assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!"); + assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!"); +#else + assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!"); +#endif + + intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr); + + // Rewrite the call target... so that we don't end up here every time we + // execute the call. 
+#if defined (X86_64_JIT) + assert(isStub && + "X86-64 doesn't support rewriting non-stub lazy compilation calls:" + " the call instruction varies too much."); +#else + *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4); +#endif + + if (isStub) { + // If this is a stub, rewrite the call into an unconditional branch + // instruction so that two return addresses are not pushed onto the stack + // when the requested function finally gets called. This also makes the + // 0xCE byte (interrupt) dead, so the marker doesn't affect anything. +#if defined (X86_64_JIT) + // If the target address is within 32-bit range of the stub, use a + // PC-relative branch instead of loading the actual address. (This is + // considerably shorter than the 64-bit immediate load already there.) + // We assume here intptr_t is 64 bits. + intptr_t diff = NewVal-RetAddr+7; + if (diff >= -2147483648LL && diff <= 2147483647LL) { + *(unsigned char*)(RetAddr-0xc) = 0xE9; + *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff; + } else { + *(intptr_t *)(RetAddr - 0xa) = NewVal; + ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6)); + } + sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd); +#else + ((unsigned char*)RetAddr)[-1] = 0xE9; + sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5); +#endif + } + + // Change the return address to reexecute the call instruction... +#if defined (X86_64_JIT) + *RetAddrLoc -= 0xd; +#else + *RetAddrLoc -= 5; +#endif +} +} + +TargetJITInfo::LazyResolverFn +X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { + TsanIgnoreWritesBegin(); + JITCompilerFunction = F; + TsanIgnoreWritesEnd(); + +#if defined (X86_32_JIT) && !defined (_MSC_VER) + if (Subtarget->hasSSE1()) + return X86CompilationCallback_SSE; +#endif + + return X86CompilationCallback; +} + +X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + useGOT = 0; + TLSOffset = 0; +} + +void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, + JITCodeEmitter &JCE) { +#if defined (X86_64_JIT) + const unsigned Alignment = 8; + uint8_t Buffer[8]; + uint8_t *Cur = Buffer; + MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr); + MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32)); +#else + const unsigned Alignment = 4; + uint8_t Buffer[4]; + uint8_t *Cur = Buffer; + MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr); +#endif + return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment); +} + +TargetJITInfo::StubLayout X86JITInfo::getStubLayout() { + // The 64-bit stub contains: + // movabs r10 <- 8-byte-target-address # 10 bytes + // call|jmp *r10 # 3 bytes + // The 32-bit stub contains a 5-byte call|jmp. + // If the stub is a call to the compilation callback, an extra byte is added + // to mark it as a stub. + StubLayout Result = {14, 4}; + return Result; +} + +void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, + JITCodeEmitter &JCE) { + // Note, we cast to intptr_t here to silence a -pedantic warning that + // complains about casting a function pointer to a normal pointer. 
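+  // For reference, the 14-byte 64-bit lazy-compilation stub emitted below is:
+  //   49 BA <imm64>   movabsq $Target, %r10
+  //   41 FF D2        callq *%r10   (ModRM = 2|(2<<3)|(3<<6) = 0xD2)
+  //   CE              marker byte identifying a lazy stub
+  // Non-callback stubs use jmpq *%r10 (ModRM 0xE2) and omit the marker,
+  // which is why getStubLayout() above reports 14 bytes, align 4.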
+#if defined (X86_32_JIT) && !defined (_MSC_VER) + bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback && + Target != (void*)(intptr_t)X86CompilationCallback_SSE); +#else + bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback; +#endif + JCE.emitAlignment(4); + void *Result = (void*)JCE.getCurrentPCValue(); + if (NotCC) { +#if defined (X86_64_JIT) + JCE.emitByte(0x49); // REX prefix + JCE.emitByte(0xB8+2); // movabsq r10 + JCE.emitWordLE((unsigned)(intptr_t)Target); + JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); + JCE.emitByte(0x41); // REX prefix + JCE.emitByte(0xFF); // jmpq *r10 + JCE.emitByte(2 | (4 << 3) | (3 << 6)); +#else + JCE.emitByte(0xE9); + JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); +#endif + return Result; + } + +#if defined (X86_64_JIT) + JCE.emitByte(0x49); // REX prefix + JCE.emitByte(0xB8+2); // movabsq r10 + JCE.emitWordLE((unsigned)(intptr_t)Target); + JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); + JCE.emitByte(0x41); // REX prefix + JCE.emitByte(0xFF); // callq *r10 + JCE.emitByte(2 | (2 << 3) | (3 << 6)); +#else + JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination... + + JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); +#endif + + // This used to use 0xCD, but that value is used by JITMemoryManager to + // initialize the buffer with garbage, which means it may follow a + // noreturn function call, confusing LLVMX86CompilationCallback2. PR 4929. + JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub! + return Result; +} + +/// getPICJumpTableEntry - Returns the value of the jumptable entry for the +/// specific basic block. +uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) { +#if defined(X86_64_JIT) + return BB - Entry; +#else + return BB - PICBase; +#endif +} + +template<typename T> static void addUnaligned(void *Pos, T Delta) { + T Value; + std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos), + sizeof(T)); + Value += Delta; + std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value), + sizeof(T)); +} + +/// relocate - Before the JIT can run a block of code that has been emitted, +/// it must rewrite the code to contain the actual addresses of any +/// referenced global symbols. +void X86JITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); + intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); + switch ((X86::RelocationType)MR->getRelocationType()) { + case X86::reloc_pcrel_word: { + // PC relative relocation, add the relocated value to the value already in + // memory, after we adjust it for where the PC is. + ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal(); + addUnaligned<unsigned>(RelocPos, ResultPtr); + break; + } + case X86::reloc_picrel_word: { + // PIC base relative relocation, add the relocated value to the value + // already in memory, after we adjust it for where the PIC base is. + ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal()); + addUnaligned<unsigned>(RelocPos, ResultPtr); + break; + } + case X86::reloc_absolute_word: + case X86::reloc_absolute_word_sext: + // Absolute relocation, just add the relocated value to the value already + // in memory. 
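+      // E.g. (hypothetical values) a 32-bit field holding addend 0x10 whose
+      // symbol resolves to 0x400000 is rewritten to 0x400010.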
+ addUnaligned<unsigned>(RelocPos, ResultPtr); + break; + case X86::reloc_absolute_dword: + addUnaligned<intptr_t>(RelocPos, ResultPtr); + break; + } + } +} + +char* X86JITInfo::allocateThreadLocalMemory(size_t size) { +#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER) + TLSOffset -= size; + return TLSOffset; +#else + llvm_unreachable("Cannot allocate thread local storage on this arch!"); +#endif +} diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h new file mode 100644 index 0000000..f916327 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h @@ -0,0 +1,81 @@ +//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetJITInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86JITINFO_H +#define X86JITINFO_H + +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/IR/Function.h" +#include "llvm/Target/TargetJITInfo.h" + +namespace llvm { + class X86TargetMachine; + class X86Subtarget; + + class X86JITInfo : public TargetJITInfo { + X86TargetMachine &TM; + const X86Subtarget *Subtarget; + uintptr_t PICBase; + char* TLSOffset; + public: + explicit X86JITInfo(X86TargetMachine &tm); + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + virtual void replaceMachineCodeForFunction(void *Old, void *New); + + /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object + /// to emit an indirect symbol which contains the address of the specified + /// ptr. + virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, + JITCodeEmitter &JCE); + + // getStubLayout - Returns the size and alignment of the largest call stub + // on X86. + virtual StubLayout getStubLayout(); + + /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a + /// small native function that simply calls the function at the specified + /// address. + virtual void *emitFunctionStub(const Function* F, void *Target, + JITCodeEmitter &JCE); + + /// getPICJumpTableEntry - Returns the value of the jumptable entry for the + /// specific basic block. + virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase); + + /// getLazyResolverFunction - Expose the lazy resolver to the JIT. + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + + /// relocate - Before the JIT can run a block of code that has been emitted, + /// it must rewrite the code to contain the actual addresses of any + /// referenced global symbols. + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + + /// allocateThreadLocalMemory - Each target has its own way of + /// handling thread local variables. This method returns a value only + /// meaningful to the target. + virtual char* allocateThreadLocalMemory(size_t size); + + /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute + /// PIC jumptable entry. 
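+    /// On x86-32 the JIT stores the address of the emitted PIC base label
+    /// here; getPICJumpTableEntry() then encodes each jumptable entry as
+    /// BB - PICBase, while x86-64 entries are simply BB - Entry.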
+  void setPICBase(uintptr_t Base) { PICBase = Base; } + uintptr_t getPICBase() const { return PICBase; } + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp new file mode 100644 index 0000000..a8a9fd8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -0,0 +1,794 @@ +//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower X86 MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "X86COFFMachineModuleInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Target/Mangler.h" +using namespace llvm; + +namespace { + +/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst. +class X86MCInstLower { + MCContext &Ctx; + Mangler *Mang; + const MachineFunction &MF; + const TargetMachine &TM; + const MCAsmInfo &MAI; + X86AsmPrinter &AsmPrinter; +public: + X86MCInstLower(Mangler *mang, const MachineFunction &MF, + X86AsmPrinter &asmprinter); + + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + +private: + MachineModuleInfoMachO &getMachOMMI() const; +}; + +} // end anonymous namespace + +X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, + X86AsmPrinter &asmprinter) +: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()), + MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} + +MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { + return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); +} + + +/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol +/// operand to an MCSymbol. +MCSymbol *X86MCInstLower:: +GetSymbolFromOperand(const MachineOperand &MO) const { + assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); + + SmallString<128> Name; + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + bool isImplicitlyPrivate = false; + if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || + MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) + isImplicitlyPrivate = true; + + Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate); + } else if (MO.isSymbol()) { + Name += MAI.getGlobalPrefix(); + Name += MO.getSymbolName(); + } else if (MO.isMBB()) { + Name += MO.getMBB()->getSymbol()->getName(); + } + + // If the target flags on the operand change the name of the symbol, do that + // before we return the symbol. 
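+  // For instance, MO_DLLIMPORT turns "foo" into "__imp_foo", while the
+  // Darwin flags append "$non_lazy_ptr" or "$stub" and register the symbol
+  // in the corresponding MachO stub table, as the cases below show.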
+ switch (MO.getTargetFlags()) { + default: break; + case X86II::MO_DLLIMPORT: { + // Handle dllimport linkage. + const char *Prefix = "__imp_"; + Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix)); + break; + } + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { + Name += "$non_lazy_ptr"; + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); + + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getGVStubEntry(Sym); + if (StubSym.getPointer() == 0) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + return Sym; + } + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { + Name += "$non_lazy_ptr"; + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getHiddenGVStubEntry(Sym); + if (StubSym.getPointer() == 0) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + return Sym; + } + case X86II::MO_DARWIN_STUB: { + Name += "$stub"; + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getFnStubEntry(Sym); + if (StubSym.getPointer()) + return Sym; + + if (MO.isGlobal()) { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } else { + Name.erase(Name.end()-5, Name.end()); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false); + } + return Sym; + } + } + + return Ctx.GetOrCreateSymbol(Name.str()); +} + +MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + const MCExpr *Expr = 0; + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + case X86II::MO_NO_FLAG: // No flag. + // These affect the name of the symbol, not any suffix. + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DLLIMPORT: + case X86II::MO_DARWIN_STUB: + break; + + case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break; + case X86II::MO_TLVP_PIC_BASE: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + // Subtract the pic base. 
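+    // The lowered operand then evaluates to something like
+    // "_var@TLVP - L1$pb" (hypothetical labels): the thread-local
+    // reference made relative to this function's PIC base.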
+ Expr = MCBinaryExpr::CreateSub(Expr, + MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), + Ctx), + Ctx); + break; + case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break; + case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; + case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break; + case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break; + case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; + case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; + case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break; + case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break; + case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break; + case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break; + case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break; + case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; + case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; + case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break; + case X86II::MO_PIC_BASE_OFFSET: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + // Subtract the pic base. + Expr = MCBinaryExpr::CreateSub(Expr, + MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx), + Ctx); + if (MO.isJTI() && MAI.hasSetDirective()) { + // If .set directive is supported, use it to reduce the number of + // relocations the assembler will generate for differences between + // local labels. This is only safe when the symbols are in the same + // section so we are restricting it to jumptable references. + MCSymbol *Label = Ctx.CreateTempSymbol(); + AsmPrinter.OutStreamer.EmitAssignment(Label, Expr); + Expr = MCSymbolRefExpr::Create(Label, Ctx); + } + break; + } + + if (Expr == 0) + Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); + + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(MO.getOffset(), Ctx), + Ctx); + return MCOperand::CreateExpr(Expr); +} + + + +static void lower_subreg32(MCInst *MI, unsigned OpNo) { + // Convert registers in the addr mode according to subreg32. + unsigned Reg = MI->getOperand(OpNo).getReg(); + if (Reg != 0) + MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32)); +} + +static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) { + // Convert registers in the addr mode according to subreg64. + for (unsigned i = 0; i != 4; ++i) { + if (!MI->getOperand(OpNo+i).isReg()) continue; + + unsigned Reg = MI->getOperand(OpNo+i).getReg(); + // LEAs can use RIP-relative addressing, and RIP has no sub/super register. + if (Reg == 0 || Reg == X86::RIP) continue; + + MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64)); + } +} + +/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8. +static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) { + OutMI.setOpcode(NewOpc); + lower_subreg32(&OutMI, 0); +} +/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R +static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) { + OutMI.setOpcode(NewOpc); + OutMI.addOperand(OutMI.getOperand(0)); + OutMI.addOperand(OutMI.getOperand(0)); +} + +/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with +/// a short fixed-register form. 
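+/// E.g. "addl $42, %eax" encoded as ADD32ri (opcode 81 /0 imm32, 6 bytes)
+/// can be re-emitted in the fixed-EAX form ADD32i32 (05 imm32, 5 bytes),
+/// dropping the ModRM byte; callers pass that short-form opcode in.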
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { + unsigned ImmOp = Inst.getNumOperands() - 1; + assert(Inst.getOperand(0).isReg() && + (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) && + ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) || + Inst.getNumOperands() == 2) && "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(0).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(ImmOp); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); +} + +/// \brief Simplify things like MOV32rm to MOV32o32a. +static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, + unsigned Opcode) { + // Don't make these simplifications in 64-bit mode; other assemblers don't + // perform them because they make the code larger. + if (Printer.getSubtarget().is64Bit()) + return; + + bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg(); + unsigned AddrBase = IsStore; + unsigned RegOp = IsStore ? 0 : 5; + unsigned AddrOp = AddrBase + 3; + assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() && + Inst.getOperand(AddrBase + 0).isReg() && // base + Inst.getOperand(AddrBase + 1).isImm() && // scale + Inst.getOperand(AddrBase + 2).isReg() && // index register + (Inst.getOperand(AddrOp).isExpr() || // address + Inst.getOperand(AddrOp).isImm())&& + Inst.getOperand(AddrBase + 4).isReg() && // segment + "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(RegOp).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // Check whether this is an absolute address. + // FIXME: We know TLVP symbol refs aren't, but there should be a better way + // to do this here. + bool Absolute = true; + if (Inst.getOperand(AddrOp).isExpr()) { + const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr(); + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE)) + if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP) + Absolute = false; + } + + if (Absolute && + (Inst.getOperand(AddrBase + 0).getReg() != 0 || + Inst.getOperand(AddrBase + 2).getReg() != 0 || + Inst.getOperand(AddrBase + 4).getReg() != 0 || + Inst.getOperand(AddrBase + 1).getImm() != 1)) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(AddrOp); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); +} + +void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. 
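+      // (E.g. the implicit EFLAGS def carried by most arithmetic
+      // MachineInstrs has no encoding of its own, so it must not be
+      // turned into an MCOperand.)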
+ if (MO.isImplicit()) continue; + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand(MO, + AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); + break; + case MachineOperand::MO_RegisterMask: + // Ignore call clobbers. + continue; + } + + OutMI.addOperand(MCOp); + } + + // Handle a few special cases to eliminate operand modifiers. +ReSimplify: + switch (OutMI.getOpcode()) { + case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand. + lower_lea64_32mem(&OutMI, 1); + // FALL THROUGH. + case X86::LEA64r: + case X86::LEA16r: + case X86::LEA32r: + // LEA should have a segment register, but it must be empty. + assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands && + "Unexpected # of LEA operands"); + assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && + "LEA has segment specified!"); + break; + case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break; + case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break; + case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break; + case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; + case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; + case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break; + case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break; + case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; + case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; + + case X86::MOV16r0: + LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 + LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr + break; + case X86::MOV64r0: + LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0 + LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr + break; + + // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B + // if one of the registers is extended, but other isn't. 
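+ // (Encoding background, as a sketch: in the forward forms below the source
+ // register lands in ModRM.r/m, so an extended source (XMM8-XMM15) needs
+ // VEX.B and therefore the 3-byte VEX prefix. The _REV forms use the other
+ // opcode of the mov pair, placing the extended register in ModRM.reg where
+ // VEX.R suffices, which permits the 2-byte VEX prefix. E.g. for
+ // "vmovaps %xmm8, %xmm0", VMOVAPSrr_REV saves one byte.)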
+ case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + } + OutMI.setOpcode(NewOpc); + } + break; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + OutMI.setOpcode(NewOpc); + } + break; + } + + // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register + // inputs modeled as normal uses instead of implicit uses. As such, truncate + // off all but the first operand (the callee). FIXME: Change isel. + case X86::TAILJMPr64: + case X86::CALL64r: + case X86::CALL64pcrel32: { + unsigned Opcode = OutMI.getOpcode(); + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + OutMI = MCInst(); + OutMI.setOpcode(X86::RET); + break; + } + + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::TAILJMPr: Opcode = X86::JMP32r; break; + case X86::TAILJMPd: + case X86::TAILJMPd64: Opcode = X86::JMP_1; break; + } + + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + // These are pseudo-ops for OR to help with the OR->ADD transformation. We do + // this with an ugly goto in case the resultant OR uses EAX and needs the + // short form. 
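+ // (The _DB suffix marks adds whose operands are known to have no common
+ // bits set - "disjoint bits" - so ADD and OR compute the same value. Isel
+ // prefers the ADD form because it can fold into LEA; whatever survives to
+ // this point is emitted as a plain OR.)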
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; + case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; + case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; + case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; + case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; + case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; + case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify; + case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; + case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; + + // The assembler backend wants to see branches in their small form and relax + // them to their large form. The JIT can only handle the large form because + // it does not do relaxation. For now, translate the large form to the + // small one here. + case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break; + case X86::JO_4: OutMI.setOpcode(X86::JO_1); break; + case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break; + case X86::JB_4: OutMI.setOpcode(X86::JB_1); break; + case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break; + case X86::JE_4: OutMI.setOpcode(X86::JE_1); break; + case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break; + case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break; + case X86::JA_4: OutMI.setOpcode(X86::JA_1); break; + case X86::JS_4: OutMI.setOpcode(X86::JS_1); break; + case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break; + case X86::JP_4: OutMI.setOpcode(X86::JP_1); break; + case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break; + case X86::JL_4: OutMI.setOpcode(X86::JL_1); break; + case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break; + case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break; + case X86::JG_4: OutMI.setOpcode(X86::JG_1); break; + + // Atomic load and store require a separate pseudo-inst because Acquire + // implies mayStore and Release implies mayLoad; fix these to regular MOV + // instructions here + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + + // We don't currently select the correct instruction form for instructions + // which have a short %eax, etc. form. Handle this by custom lowering, for + // now. 
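+ // For example, in 32-bit mode "movl x, %eax" with a plain absolute address
+ // can use the EAX-only moffs encoding (0xA1 disp32, 5 bytes) instead of the
+ // generic MOV32rm encoding (0x8B /r disp32, 6 bytes); the MOV cases below
+ // route through SimplifyShortMoveForm to perform that rewrite when the
+ // address has no base, index, or segment register.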
+ // + // Note, we are currently not handling the following instructions: + // MOV64ao8, MOV64o8a + // XCHG16ar, XCHG32ar, XCHG64ar + case X86::MOV8mr_NOREX: + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break; + case X86::MOV8rm_NOREX: + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + + case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; + case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break; + case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break; + case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break; + case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break; + case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break; + case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break; + case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break; + case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break; + case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break; + case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break; + case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break; + case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break; + case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break; + case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break; + case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break; + case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break; + case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break; + case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break; + case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break; + case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break; + case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break; + case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break; + case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break; + case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break; + case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break; + case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break; + case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break; + case X86::TEST8ri: SimplifyShortImmForm(OutMI, X86::TEST8i8); break; + case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break; + case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break; + case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break; + case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break; + case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break; + case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; + case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + + case X86::MORESTACK_RET: + OutMI.setOpcode(X86::RET); + break; + + case X86::MORESTACK_RET_RESTORE_R10: + OutMI.setOpcode(X86::MOV64rr); + OutMI.addOperand(MCOperand::CreateReg(X86::R10)); + OutMI.addOperand(MCOperand::CreateReg(X86::RAX)); + + 
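+ // (Note the ordering: the RET below is streamed first, and the MOV64rr we
+ // just built into OutMI is emitted after it by our caller. The MOV is still
+ // reachable: __morestack re-enters the function just past the RET, and the
+ // MOV restores the static-chain register R10 that the segmented-stack
+ // prologue stashed in RAX for nested functions.)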
AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET)); + break; + } +} + +static void LowerTlsAddr(MCStreamer &OutStreamer, + X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { + + bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + MI.getOpcode() == X86::TLS_base_addr64; + + bool needsPadding = MI.getOpcode() == X86::TLS_addr64; + + MCContext &context = OutStreamer.getContext(); + + if (needsPadding) + OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + + MCSymbolRefExpr::VariantKind SRVK; + switch (MI.getOpcode()) { + case X86::TLS_addr32: + case X86::TLS_addr64: + SRVK = MCSymbolRefExpr::VK_TLSGD; + break; + case X86::TLS_base_addr32: + SRVK = MCSymbolRefExpr::VK_TLSLDM; + break; + case X86::TLS_base_addr64: + SRVK = MCSymbolRefExpr::VK_TLSLD; + break; + default: + llvm_unreachable("unexpected opcode"); + } + + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); + const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context); + + MCInst LEA; + if (is64Bits) { + LEA.setOpcode(X86::LEA64r); + LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest + LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(0)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(0)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } else { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::CreateReg(0)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } + OutStreamer.EmitInstruction(LEA); + + if (needsPadding) { + OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX)); + } + + StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; + MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name); + const MCSymbolRefExpr *tlsRef = + MCSymbolRefExpr::Create(tlsGetAddr, + MCSymbolRefExpr::VK_PLT, + context); + + OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 + : X86::CALLpcrel32) + .addExpr(tlsRef)); +} + +void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { + X86MCInstLower MCInstLowering(Mang, *MF, *this); + switch (MI->getOpcode()) { + case TargetOpcode::DBG_VALUE: + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + std::string TmpStr; + raw_string_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + + // Emit nothing here but a comment if we can. + case X86::Int_MemBarrier: + if (OutStreamer.hasRawTextSupport()) + OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER")); + return; + + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + // Lower these as normal, but add some comments. 
+ unsigned Reg = MI->getOperand(0).getReg(); + OutStreamer.AddComment(StringRef("eh_return, addr: %") + + X86ATTInstPrinter::getRegisterName(Reg)); + break; + } + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: + // Lower these as normal, but add some comments. + OutStreamer.AddComment("TAILCALL"); + break; + + case X86::TLS_addr32: + case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + return LowerTlsAddr(OutStreamer, MCInstLowering, *MI); + + case X86::MOVPC32r: { + // This is a pseudo op for a two instruction sequence with a label, which + // looks like: + // call "L1$pb" + // "L1$pb": + // popl %esi + + // Emit the call. + MCSymbol *PICBase = MF->getPICBaseSymbol(); + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + OutStreamer.EmitInstruction(MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext))); + + // Emit the label. + OutStreamer.EmitLabel(PICBase); + + // popl $reg + OutStreamer.EmitInstruction(MCInstBuilder(X86::POP32r) + .addReg(MI->getOperand(0).getReg())); + return; + } + + case X86::ADD32ri: { + // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri. + if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS) + break; + + // Okay, we have something like: + // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL) + + // For this, we want to print something like: + // MYGLOBAL + (. - PICBASE) + // However, we can't generate a ".", so just emit a new label here and refer + // to it. + MCSymbol *DotSym = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(DotSym); + + // Now that we have emitted the label, lower the complex operand expression. + MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); + + const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext); + const MCExpr *PICBase = + MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext); + DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext); + + DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext), + DotExpr, OutContext); + + OutStreamer.EmitInstruction(MCInstBuilder(X86::ADD32ri) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(DotExpr)); + return; + } + } + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + OutStreamer.EmitInstruction(TmpInst); +} diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp new file mode 100644 index 0000000..568dc22 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -0,0 +1,14 @@ +//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..78d20ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,145 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ /// ForceFramePointer - True if the function is required to use a frame
+ /// pointer for reasons other than it containing dynamic allocation or
+ /// FP elimination being turned off. For example, the Cygwin main function
+ /// contains stack pointer re-alignment code which requires FP.
+ bool ForceFramePointer;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+ /// BytesToPopOnReturn - Number of bytes the function pops on return (in
+ /// addition to the space used by the return address).
+ /// Used on Windows platforms for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which the return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+ /// GlobalBaseReg - Keeps track of the virtual register initialized for
+ /// use as the global base register. This is used for PIC in some PIC
+ /// relocation models.
+ unsigned GlobalBaseReg;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+ /// RegSaveFrameIndex - X86-64 vararg func register save area.
+ int RegSaveFrameIndex;
+ /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+ unsigned VarArgsGPOffset;
+ /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+ unsigned VarArgsFPOffset;
+ /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+ /// being passed on the stack.
+ unsigned ArgumentStackSize;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics; + +public: + X86MachineFunctionInfo() : ForceFramePointer(false), + CalleeSavedFrameSize(0), + BytesToPopOnReturn(0), + ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), + GlobalBaseReg(0), + VarArgsFrameIndex(0), + RegSaveFrameIndex(0), + VarArgsGPOffset(0), + VarArgsFPOffset(0), + ArgumentStackSize(0), + NumLocalDynamics(0) {} + + explicit X86MachineFunctionInfo(MachineFunction &MF) + : ForceFramePointer(false), + CalleeSavedFrameSize(0), + BytesToPopOnReturn(0), + ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), + GlobalBaseReg(0), + VarArgsFrameIndex(0), + RegSaveFrameIndex(0), + VarArgsGPOffset(0), + VarArgsFPOffset(0), + ArgumentStackSize(0), + NumLocalDynamics(0) {} + + bool getForceFramePointer() const { return ForceFramePointer;} + void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } + + unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } + void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;} + + int getRAIndex() const { return ReturnAddrIndex; } + void setRAIndex(int Index) { ReturnAddrIndex = Index; } + + int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } + void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} + + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + + unsigned getGlobalBaseReg() const { return GlobalBaseReg; } + void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } + + int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } + void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } + void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } + + unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; } + void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; } + + unsigned getArgumentStackSize() const { return ArgumentStackSize; } + void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp new file mode 100644 index 0000000..83e75ea --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -0,0 +1,212 @@ +//===-------- X86PadShortFunction.cpp - pad short functions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will pad short functions to prevent +// a stall if a function returns before the return address is ready. This +// is needed for some Intel Atom processors. 
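+// For example, with the 4-cycle Threshold used below, a function whose RET
+// is reachable after only 2 cycles of work gets NOP padding inserted before
+// the RET so that the return address is ready by the time it is needed.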
+// +//===----------------------------------------------------------------------===// + +#include <algorithm> + +#define DEBUG_TYPE "x86-pad-short-functions" +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +STATISTIC(NumBBsPadded, "Number of basic blocks padded"); + +namespace { + struct VisitedBBInfo { + // HasReturn - Whether the BB contains a return instruction + bool HasReturn; + + // Cycles - Number of cycles until return if HasReturn is true, otherwise + // number of cycles until end of the BB + unsigned int Cycles; + + VisitedBBInfo() : HasReturn(false), Cycles(0) {} + VisitedBBInfo(bool HasReturn, unsigned int Cycles) + : HasReturn(HasReturn), Cycles(Cycles) {} + }; + + struct PadShortFunc : public MachineFunctionPass { + static char ID; + PadShortFunc() : MachineFunctionPass(ID) + , Threshold(4), TM(0), TII(0) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "X86 Atom pad short functions"; + } + + private: + void findReturns(MachineBasicBlock *MBB, + unsigned int Cycles = 0); + + bool cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles); + + void addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd); + + const unsigned int Threshold; + + // ReturnBBs - Maps basic blocks that return to the minimum number of + // cycles until the return, starting from the entry block. + DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs; + + // VisitedBBs - Cache of previously visited BBs. + DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs; + + const TargetMachine *TM; + const TargetInstrInfo *TII; + }; + + char PadShortFunc::ID = 0; +} + +FunctionPass *llvm::createX86PadShortFunctions() { + return new PadShortFunc(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// NOOP instructions before early exits. +bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { + const AttributeSet &FnAttrs = MF.getFunction()->getAttributes(); + if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize)) { + return false; + } + + TM = &MF.getTarget(); + TII = TM->getInstrInfo(); + + // Search through basic blocks and mark the ones that have early returns + ReturnBBs.clear(); + VisitedBBs.clear(); + findReturns(MF.begin()); + + bool MadeChange = false; + + MachineBasicBlock *MBB; + unsigned int Cycles = 0; + + // Pad the identified basic blocks with NOOPs + for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); + I != ReturnBBs.end(); ++I) { + MBB = I->first; + Cycles = I->second; + + if (Cycles < Threshold) { + // BB ends in a return. Skip over any DBG_VALUE instructions + // trailing the terminator. 
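+ // (The padding amount is Threshold - Cycles; addPadding below emits two
+ // NOPs per requested unit, presumably because Atom can issue two NOPs in
+ // a single cycle.)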
+ assert(MBB->size() > 0 && + "Basic block should contain at least a RET but is empty"); + MachineBasicBlock::iterator ReturnLoc = --MBB->end(); + + while (ReturnLoc->isDebugValue()) + --ReturnLoc; + assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() && + "Basic block does not end with RET"); + + addPadding(MBB, ReturnLoc, Threshold - Cycles); + NumBBsPadded++; + MadeChange = true; + } + } + + return MadeChange; +} + +/// findReturn - Starting at MBB, follow control flow and add all +/// basic blocks that contain a return to ReturnBBs. +void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { + // If this BB has a return, note how many cycles it takes to get there. + bool hasReturn = cyclesUntilReturn(MBB, Cycles); + if (Cycles >= Threshold) + return; + + if (hasReturn) { + ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + return; + } + + // Follow branches in BB and look for returns + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(); + I != MBB->succ_end(); ++I) { + if (*I == MBB) + continue; + findReturns(*I, Cycles); + } +} + +/// cyclesUntilReturn - return true if the MBB has a return instruction, +/// and return false otherwise. +/// Cycles will be incremented by the number of cycles taken to reach the +/// return or the end of the BB, whichever occurs first. +bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles) { + // Return cached result if BB was previously visited + DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it + = VisitedBBs.find(MBB); + if (it != VisitedBBs.end()) { + VisitedBBInfo BBInfo = it->second; + Cycles += BBInfo.Cycles; + return BBInfo.HasReturn; + } + + unsigned int CyclesToEnd = 0; + + for (MachineBasicBlock::iterator MBBI = MBB->begin(); + MBBI != MBB->end(); ++MBBI) { + MachineInstr *MI = MBBI; + // Mark basic blocks with a return instruction. Calls to other + // functions do not count because the called function will be padded, + // if necessary. + if (MI->isReturn() && !MI->isCall()) { + VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd); + Cycles += CyclesToEnd; + return true; + } + + CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI); + } + + VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); + Cycles += CyclesToEnd; + return false; +} + +/// addPadding - Add the given number of NOOP instructions to the function +/// just prior to the return at MBBI +void PadShortFunc::addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd) { + DebugLoc DL = MBBI->getDebugLoc(); + + while (NOOPsToAdd-- > 0) { + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp new file mode 100644 index 0000000..16886e4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -0,0 +1,691 @@ +//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// This file is responsible for the frame pointer elimination optimization +// on X86. 
+// +//===----------------------------------------------------------------------===// + +#include "X86RegisterInfo.h" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +#define GET_REGINFO_TARGET_DESC +#include "X86GenRegisterInfo.inc" + +using namespace llvm; + +cl::opt<bool> +ForceStackAlign("force-align-stack", + cl::desc("Force align the stack to the minimum alignment" + " needed for the function."), + cl::init(false), cl::Hidden); + +static cl::opt<bool> +EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + +X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, + const TargetInstrInfo &tii) + : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), + X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), + (tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::RIP : X86::EIP)), + TM(tm), TII(tii) { + X86_MC::InitLLVM2SEHRegisterMapping(this); + + // Cache some information. + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + Is64Bit = Subtarget->is64Bit(); + IsWin64 = Subtarget->isTargetWin64(); + + if (Is64Bit) { + SlotSize = 8; + StackPtr = X86::RSP; + FramePtr = X86::RBP; + } else { + SlotSize = 4; + StackPtr = X86::ESP; + FramePtr = X86::EBP; + } + // Use a callee-saved register as the base pointer. These registers must + // not conflict with any ABI requirements. For example, in 32-bit mode PIC + // requires GOT in the EBX register before function calls via PLT GOT pointer. + BasePtr = Is64Bit ? X86::RBX : X86::ESI; +} + +/// getCompactUnwindRegNum - This function maps the register to the number for +/// compact unwind encoding. Return -1 if the register isn't valid. +int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { + switch (getLLVMRegNum(RegNum, isEH)) { + case X86::EBX: case X86::RBX: return 1; + case X86::ECX: case X86::R12: return 2; + case X86::EDX: case X86::R13: return 3; + case X86::EDI: case X86::R14: return 4; + case X86::ESI: case X86::R15: return 5; + case X86::EBP: case X86::RBP: return 6; + } + + return -1; +} + +bool +X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // Only enable when post-RA scheduling is enabled and this is needed. + return TM.getSubtargetImpl()->postRAScheduler(); +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + return getEncodingValue(i); +} + +const TargetRegisterClass * +X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, + unsigned Idx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. 
+ // It behaves just like the sub_8bit_hi index. + if (!Is64Bit && Idx == X86::sub_8bit) + Idx = X86::sub_8bit_hi; + + // Forward to TableGen's default version. + return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); +} + +const TargetRegisterClass * +X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned SubIdx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. + if (!Is64Bit && SubIdx == X86::sub_8bit) { + A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); + if (!A) + return 0; + } + return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); +} + +const TargetRegisterClass* +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ + // Don't allow super-classes of GR8_NOREX. This class is only used after + // extrating sub_8bit_hi sub-registers. The H sub-registers cannot be copied + // to the full GR8 register class in 64-bit mode, so we cannot allow the + // reigster class inflation. + // + // The GR8_NOREX class is always used in a way that won't be constrained to a + // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the + // full GR8 class. + if (RC == &X86::GR8_NOREXRegClass) + return RC; + + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); + do { + switch (Super->getID()) { + case X86::GR8RegClassID: + case X86::GR16RegClassID: + case X86::GR32RegClassID: + case X86::GR64RegClassID: + case X86::FR32RegClassID: + case X86::FR64RegClassID: + case X86::RFP32RegClassID: + case X86::RFP64RegClassID: + case X86::RFP80RegClassID: + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // Don't return a super-class that would shrink the spill size. + // That can happen with the vector and float classes. + if (Super->getSize() == RC->getSize()) + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + +const TargetRegisterClass * +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + switch (Kind) { + default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); + case 0: // Normal GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64RegClass; + return &X86::GR32RegClass; + case 1: // Normal GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOSPRegClass; + return &X86::GR32_NOSPRegClass; + case 2: // Available for tailcall (not callee-saved GPRs). + if (Subtarget.isTargetWin64()) + return &X86::GR64_TCW64RegClass; + else if (Subtarget.is64Bit()) + return &X86::GR64_TCRegClass; + + const Function *F = MF.getFunction(); + bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; + } +} + +const TargetRegisterClass * +X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &X86::CCRRegClass) { + if (Is64Bit) + return &X86::GR64RegClass; + else + return &X86::GR32RegClass; + } + return RC; +} + +unsigned +X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + unsigned FPDiff = TFI->hasFP(MF) ? 
1 : 0;
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case X86::GR32RegClassID:
+ return 4 - FPDiff;
+ case X86::GR64RegClassID:
+ return 12 - FPDiff;
+ case X86::VR128RegClassID:
+ return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+ case X86::VR64RegClassID:
+ return 4;
+ }
+}
+
+const uint16_t *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ switch (MF->getFunction()->getCallingConv()) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_SaveList;
+
+ case CallingConv::Intel_OCL_BI: {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ break;
+ }
+
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_MostRegs_64_SaveList;
+ break;
+
+ default:
+ break;
+ }
+
+ bool CallsEHReturn = MF->getMMI().callsEHReturn();
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_SaveList;
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ }
+ if (CallsEHReturn)
+ return CSR_32EHRet_SaveList;
+ return CSR_32_SaveList;
+}
+
+const uint32_t*
+X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+ if (CC == CallingConv::Intel_OCL_BI) {
+ if (IsWin64 && HasAVX)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (Is64Bit && HasAVX)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ }
+ if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
+ return CSR_NoRegs_RegMask;
+ if (!Is64Bit)
+ return CSR_32_RegMask;
+ if (CC == CallingConv::Cold)
+ return CSR_MostRegs_64_RegMask;
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ // Set the stack-pointer register and its aliases as reserved.
+ Reserved.set(X86::RSP);
+ for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ Reserved.set(X86::RIP);
+ for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (TFI->hasFP(MF)) {
+ Reserved.set(X86::RBP);
+ for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t* RegMask = getCallPreservedMask(CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+ "Stack realignment in presence of dynamic allocas is not supported with "
+ "this calling convention.");
+
+ Reserved.set(getBaseRegister());
+ for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Mark the segment registers as reserved.
+ Reserved.set(X86::CS);
+ Reserved.set(X86::SS);
+ Reserved.set(X86::DS);
+ Reserved.set(X86::ES);
+ Reserved.set(X86::FS);
+ Reserved.set(X86::GS);
+
+ // Mark the floating point stack registers as reserved.
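+ // (The x87 stack registers are not allocatable directly: the register
+ // allocator works on the RFP32/64/80 classes, and the floating point
+ // stackifier pass later rewrites those onto the ST0-ST7 stack.)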
+ Reserved.set(X86::ST0); + Reserved.set(X86::ST1); + Reserved.set(X86::ST2); + Reserved.set(X86::ST3); + Reserved.set(X86::ST4); + Reserved.set(X86::ST5); + Reserved.set(X86::ST6); + Reserved.set(X86::ST7); + + // Reserve the registers that only exist in 64-bit mode. + if (!Is64Bit) { + // These 8-bit registers are part of the x86-64 extension even though their + // super-registers are old 32-bits. + Reserved.set(X86::SIL); + Reserved.set(X86::DIL); + Reserved.set(X86::BPL); + Reserved.set(X86::SPL); + + for (unsigned n = 0; n != 8; ++n) { + // R8, R9, ... + static const uint16_t GPR64[] = { + X86::R8, X86::R9, X86::R10, X86::R11, + X86::R12, X86::R13, X86::R14, X86::R15 + }; + for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + + // XMM8, XMM9, ... + assert(X86::XMM15 == X86::XMM8+7); + for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } + + return Reserved; +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment and there are dynamic allocas, we can't + // reference off of the stack pointer, so we reserve a base pointer. + // + // This is also true if the function contain MS-style inline assembly. We + // do this because if any stack changes occur in the inline assembly, e.g., + // "pusha", then any C local variable or C argument references in the + // inline assembly will be wrong because the SP is not properly tracked. + if ((needsStackRealignment(MF) && MFI->hasVarSizedObjects()) || + MF.hasMSInlineAsm()) + return true; + + return false; +} + +bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + if (!MF.getTarget().Options.RealignStack) + return false; + + // Stack realignment requires a frame pointer. If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(FramePtr)) + return false; + + // If a base pointer is necessary. Check that it isn't too late to reserve + // it. + if (MFI->hasVarSizedObjects()) + return MRI->canReserveReg(BasePtr); + return true; +} + +bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *F = MF.getFunction(); + unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + bool requiresRealignment = + ((MFI->getMaxAlignment() > StackAlign) || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackAlignment)); + + // If we've requested that we force align the stack do so now. 
+ if (ForceStackAlign) + return canRealignStack(MF); + + return requiresRealignment && canRealignStack(MF); +} + +bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (Reg == FramePtr && TFI->hasFP(MF)) { + FrameIdx = MF.getFrameInfo()->getObjectIndexBegin(); + return true; + } + return false; +} + +void +X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned BasePtr; + + unsigned Opc = MI.getOpcode(); + bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm; + if (hasBasePointer(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); + else if (needsStackRealignment(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); + else if (AfterFPPop) + BasePtr = StackPtr; + else + BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + + // This must be part of a four operand memory reference. Replace the + // FrameIndex with base register with EBP. Add an offset to the offset. + MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); + + // Now add the frame object offset to the offset from EBP. + int FIOffset; + if (AfterFPPop) { + // Tail call jmp happens after FP is popped. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); + } else + FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + + if (MI.getOperand(FIOperandNum+3).isImm()) { + // Offset is a 32-bit integer. + int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); + int Offset = FIOffset + Imm; + assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && + "Requesting 64-bit offset in 32-bit immediate!"); + MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); + } else { + // Offset is symbolic. This is extremely rare. + uint64_t Offset = FIOffset + + (uint64_t)MI.getOperand(FIOperandNum+3).getOffset(); + MI.getOperand(FIOperandNum + 3).setOffset(Offset); + } +} + +unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + return TFI->hasFP(MF) ? 
FramePtr : StackPtr; +} + +unsigned X86RegisterInfo::getEHExceptionRegister() const { + llvm_unreachable("What is the exception register"); +} + +unsigned X86RegisterInfo::getEHHandlerRegister() const { + llvm_unreachable("What is the exception handler register"); +} + +namespace llvm { +unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, + bool High) { + switch (VT) { + default: llvm_unreachable("Unexpected VT"); + case MVT::i8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegister(Reg, MVT::i64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: llvm_unreachable("Unexpected register"); + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case MVT::i16: + switch (Reg) { + default: llvm_unreachable("Unexpected register"); + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case 
X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case MVT::i32: + switch (Reg) { + default: llvm_unreachable("Unexpected register"); + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case MVT::i64: + switch (Reg) { + default: llvm_unreachable("Unexpected register"); + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h 
b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h new file mode 100644 index 0000000..b9d7b8c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -0,0 +1,143 @@ +//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86REGISTERINFO_H +#define X86REGISTERINFO_H + +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "X86GenRegisterInfo.inc" + +namespace llvm { + class Type; + class TargetInstrInfo; + class X86TargetMachine; + +class X86RegisterInfo : public X86GenRegisterInfo { +public: + X86TargetMachine &TM; + const TargetInstrInfo &TII; + +private: + /// Is64Bit - Is the target 64-bits. + /// + bool Is64Bit; + + /// IsWin64 - Is the target on of win64 flavours + /// + bool IsWin64; + + /// SlotSize - Stack slot size in bytes. + /// + unsigned SlotSize; + + /// StackPtr - X86 physical register used as stack ptr. + /// + unsigned StackPtr; + + /// FramePtr - X86 physical register used as frame ptr. + /// + unsigned FramePtr; + + /// BasePtr - X86 physical register used as a base ptr in complex stack + /// frames. I.e., when we need a 3rd base, not just SP and FP, due to + /// variable size stack objects. + unsigned BasePtr; + +public: + X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; + + /// getCompactUnwindRegNum - This function maps the register to the number for + /// compact unwind encoding. Return -1 if the register isn't valid. + int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const; + + /// Code Generation virtual methods... + /// + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + + /// getMatchingSuperRegClass - Return a subclass of the specified register + /// class A so that each register in it has a sub-register of the + /// specified sub-register index which is in the specified register class B. + virtual const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, unsigned Idx) const; + + virtual const TargetRegisterClass * + getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const; + + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer + /// values. + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; + + /// getCrossCopyRegClass - Returns a legal register class to copy a register + /// in the specified class to or from. Returns NULL if it is possible to copy + /// between a two registers of the specified class. + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + + /// getCalleeSavedRegs - Return a null-terminated list of all of the + /// callee-save registers on this target. 
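+ /// For example, for the plain 64-bit SysV configuration this is the
+ /// tablegen-generated CSR_64 list (RBX, R12, R13, R14, R15, and RBP).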
+ const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + const uint32_t *getCallPreservedMask(CallingConv::ID) const; + const uint32_t *getNoPreservedMask() const; + + /// getReservedRegs - Returns a bitset indexed by physical register number + /// indicating if a register is a special register that has particular uses and + /// should be considered unavailable at all times, e.g. SP, RA. This is used by + /// the register scavenger to determine what registers are free. + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasBasePointer(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const; + + bool needsStackRealignment(const MachineFunction &MF) const; + + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + int &FrameIdx) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getStackRegister() const { return StackPtr; } + unsigned getBaseRegister() const { return BasePtr; } + // FIXME: Move to FrameInfo + unsigned getSlotSize() const { return SlotSize; } + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +// getX86SubSuperRegister - X86 utility function. It returns the sub or super +// register of a specific X86 register. +// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX +unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); + +} // End llvm namespace + +#endif
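Editorial aside: a minimal usage sketch of the helper declared above. The expected results follow from the switch in X86RegisterInfo.cpp at the top of this section; the wrapper function below is ours, nothing here is new API.

  #include "X86RegisterInfo.h"
  #include <cassert>
  using namespace llvm;

  static void checkSubSuperRegister() {
    assert(getX86SubSuperRegister(X86::EAX, MVT::i16) == X86::AX);   // narrow
    assert(getX86SubSuperRegister(X86::R15B, MVT::i64) == X86::R15); // widen
    // High = true selects the high 8-bit half where one exists.
    assert(getX86SubSuperRegister(X86::RAX, MVT::i8, /*High=*/true) == X86::AH);
  }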
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td new file mode 100644 index 0000000..be6282a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -0,0 +1,423 @@ +//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 Register file, defining the registers themselves, +// aliases between the registers, and the register classes built out of the +// registers. +// +//===----------------------------------------------------------------------===// + +class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> { + let Namespace = "X86"; + let HWEncoding = Enc; + let SubRegs = subregs; +} + +// Subregister indices. +let Namespace = "X86" in { + def sub_8bit : SubRegIndex; + def sub_8bit_hi : SubRegIndex; + def sub_16bit : SubRegIndex; + def sub_32bit : SubRegIndex; + def sub_xmm : SubRegIndex; +} + +//===----------------------------------------------------------------------===// +// Register definitions... +// + +// In the register alias definitions below, we define which registers alias +// which others. We only specify which registers the small registers alias, +// because the register file generator is smart enough to figure out that +// AL aliases AX if we tell it that AX aliases AL (for example). + +// Dwarf numbering is different for 32-bit and 64-bit, and there are +// variations by target as well. Currently the first entry is for X86-64, +// the second is for EH on X86-32/Darwin, and the third is the 'generic' one +// (X86-32/Linux and debug information on X86-32/Darwin). + +// 8-bit registers +// Low registers +def AL : X86Reg<"al", 0>; +def DL : X86Reg<"dl", 2>; +def CL : X86Reg<"cl", 1>; +def BL : X86Reg<"bl", 3>; + +// High registers. On x86-64, these cannot be used in any instruction +// with a REX prefix. +def AH : X86Reg<"ah", 4>; +def DH : X86Reg<"dh", 6>; +def CH : X86Reg<"ch", 5>; +def BH : X86Reg<"bh", 7>; + +// X86-64 only, requires REX. +let CostPerUse = 1 in { +def SIL : X86Reg<"sil", 6>; +def DIL : X86Reg<"dil", 7>; +def BPL : X86Reg<"bpl", 5>; +def SPL : X86Reg<"spl", 4>; +def R8B : X86Reg<"r8b", 8>; +def R9B : X86Reg<"r9b", 9>; +def R10B : X86Reg<"r10b", 10>; +def R11B : X86Reg<"r11b", 11>; +def R12B : X86Reg<"r12b", 12>; +def R13B : X86Reg<"r13b", 13>; +def R14B : X86Reg<"r14b", 14>; +def R15B : X86Reg<"r15b", 15>; +} + +// 16-bit registers +let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { +def AX : X86Reg<"ax", 0, [AL,AH]>; +def DX : X86Reg<"dx", 2, [DL,DH]>; +def CX : X86Reg<"cx", 1, [CL,CH]>; +def BX : X86Reg<"bx", 3, [BL,BH]>; +} +let SubRegIndices = [sub_8bit] in { +def SI : X86Reg<"si", 6, [SIL]>; +def DI : X86Reg<"di", 7, [DIL]>; +def BP : X86Reg<"bp", 5, [BPL]>; +def SP : X86Reg<"sp", 4, [SPL]>; +} +def IP : X86Reg<"ip", 0>; + +// X86-64 only, requires REX. +let SubRegIndices = [sub_8bit], CostPerUse = 1 in { +def R8W : X86Reg<"r8w", 8, [R8B]>; +def R9W : X86Reg<"r9w", 9, [R9B]>; +def R10W : X86Reg<"r10w", 10, [R10B]>; +def R11W : X86Reg<"r11w", 11, [R11B]>; +def R12W : X86Reg<"r12w", 12, [R12B]>; +def R13W : X86Reg<"r13w", 13, [R13B]>; +def R14W : X86Reg<"r14w", 14, [R14B]>; +def R15W : X86Reg<"r15w", 15, [R15B]>; +} + +// 32-bit registers +let SubRegIndices = [sub_16bit] in { +def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>; +def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>; +def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>; +def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>; +def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>; +def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>; +def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>; +def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>; +def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>; + +// X86-64 only, requires REX +let CostPerUse = 1 in { +def R8D : X86Reg<"r8d", 8, [R8W]>; +def R9D : X86Reg<"r9d", 9, [R9W]>; +def R10D : X86Reg<"r10d", 10, [R10W]>; +def R11D : X86Reg<"r11d", 11, [R11W]>; +def R12D : X86Reg<"r12d", 12, [R12W]>; +def R13D : X86Reg<"r13d", 13, [R13W]>; +def R14D : X86Reg<"r14d", 14, [R14W]>; +def R15D : X86Reg<"r15d", 15, [R15W]>; +}} + +// 64-bit registers, X86-64 only +let SubRegIndices = [sub_32bit] in { +def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>; +def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>; +def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>; +def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>; +def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>; +def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>; +def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>; +def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>; + +// These also require REX.
+let CostPerUse = 1 in { +def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>; +def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>; +def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>; +def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>; +def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>; +def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>; +def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>; +def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>; +def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>; +}} + +// MMX Registers. These are actually aliased to ST0 .. ST7 +def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>; +def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>; +def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>; +def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>; +def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>; +def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>; +def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>; +def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>; + +// Pseudo Floating Point registers +def FP0 : X86Reg<"fp0", 0>; +def FP1 : X86Reg<"fp1", 0>; +def FP2 : X86Reg<"fp2", 0>; +def FP3 : X86Reg<"fp3", 0>; +def FP4 : X86Reg<"fp4", 0>; +def FP5 : X86Reg<"fp5", 0>; +def FP6 : X86Reg<"fp6", 0>; + +// XMM Registers, used by the various SSE instruction set extensions. +def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; +def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>; +def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>; +def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>; +def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>; +def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>; +def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>; +def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>; + +// X86-64 only +let CostPerUse = 1 in { +def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>; +def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>; +def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>; +def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>; +def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; +def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; +def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; +def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; +} // CostPerUse + +// YMM Registers, used by AVX instructions +let SubRegIndices = [sub_xmm] in { +def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>; +def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>; +def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>; +def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>; +def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>; +def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>; +def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>; +def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>; +def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>; +def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>; +def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>; +def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>; +def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>; +def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>; +def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>; +def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>; +} + +class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, 
Enc> { + let Aliases = A; +} + +// Floating point stack registers. These don't map one-to-one to the FP +// pseudo registers, but we still mark them as aliasing FP registers. That +// way both kinds can be live without exceeding the stack depth. ST registers +// are only live around inline assembly. +def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>; +def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>; +def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>; +def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>; +def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>; +def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>; +def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>; +def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>; + +// Floating-point status word +def FPSW : X86Reg<"fpsw", 0>; + +// Status flags register +def EFLAGS : X86Reg<"flags", 0>; + +// Segment registers +def CS : X86Reg<"cs", 1>; +def DS : X86Reg<"ds", 3>; +def SS : X86Reg<"ss", 2>; +def ES : X86Reg<"es", 0>; +def FS : X86Reg<"fs", 4>; +def GS : X86Reg<"gs", 5>; + +// Debug registers +def DR0 : X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; + +// Control registers +def CR0 : X86Reg<"cr0", 0>; +def CR1 : X86Reg<"cr1", 1>; +def CR2 : X86Reg<"cr2", 2>; +def CR3 : X86Reg<"cr3", 3>; +def CR4 : X86Reg<"cr4", 4>; +def CR5 : X86Reg<"cr5", 5>; +def CR6 : X86Reg<"cr6", 6>; +def CR7 : X86Reg<"cr7", 7>; +def CR8 : X86Reg<"cr8", 8>; +def CR9 : X86Reg<"cr9", 9>; +def CR10 : X86Reg<"cr10", 10>; +def CR11 : X86Reg<"cr11", 11>; +def CR12 : X86Reg<"cr12", 12>; +def CR13 : X86Reg<"cr13", 13>; +def CR14 : X86Reg<"cr14", 14>; +def CR15 : X86Reg<"cr15", 15>; + +// Pseudo index registers +def EIZ : X86Reg<"eiz", 4>; +def RIZ : X86Reg<"riz", 4>; + + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. The order specified in the register list is +// implicitly defined to be the register allocation order. +// + +// List call-clobbered registers before callee-save registers. RBX, RBP (and +// R12, R13, R14, and R15 for X86-64) are callee-save registers. +// In 64-bit mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and +// R8B, ... R15B. +// Allocate R12 and R13 last, as these require an extra byte when +// encoded in x86_64 instructions. +// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in +// 64-bit mode. The main complication is that they cannot be encoded in an +// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc. +// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" +// cannot be encoded.
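Editorial illustration of the "extra byte" note above (standard x86-64 encodings, not patch content): R12's low three encoding bits collide with RSP, so using it as a base register forces a SIB byte, and R13's collide with RBP's no-base case, forcing a zero displacement:

  mov (%rax), %rcx   ->  48 8b 08       ; ModRM only
  mov (%r12), %rcx   ->  49 8b 0c 24    ; SIB byte required
  mov (%r13), %rcx   ->  49 8b 4d 00    ; zero disp8 required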
+def GR8 : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { + let AltOrders = [(sub GR8, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + }]; +} + +def GR16 : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; + +def GR32 : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; + +// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since +// RIP isn't really a register and it can't be used anywhere except in an +// address, but it doesn't cause trouble. +def GR64 : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; + +// Segment registers for use by MOV instructions (and others) that have a +// segment register as one operand. Always contain a 16-bit segment +// descriptor. +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; + +// Debug registers. +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; + +// Control registers. +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; + +// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of +// GR8, GR16, GR32, and GR64 which contain just the "a", "b", "c", and "d" +// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers +// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, +// and GR64_ABCD are classes for registers that support 8-bit h-register +// operations. +def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)>; +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R11)>; + +// GR8_NOREX - GR8 registers which do not require a REX prefix. +def GR8_NOREX : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH)> { + let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + }]; +} +// GR16_NOREX - GR16 registers which do not require a REX prefix. +def GR16_NOREX : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP)>; +// GR32_NOREX - GR32 registers which do not require a REX prefix. +def GR32_NOREX : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; +// GR64_NOREX - GR64 registers which do not require a REX prefix. +def GR64_NOREX : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; + +// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit +// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs +// to clear the upper 32 bits of RAX and so is not a NOP. +def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
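Editorial encoding note on the GR32_NOAX comment above (standard x86 encodings, not patch content): the AddRegFrm form of xchg with EAX is the one-byte 90, which the hardware executes as NOP without the implicit upper-half zeroing of a 32-bit write; any other register keeps the two-byte 87 /r form and the zeroing:

  xchg %eax, %eax  ->  90       ; executes as NOP, RAX[63:32] untouched
  xchg %ecx, %ecx  ->  87 c9    ; real 32-bit write, clears RCX[63:32]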
+ +// GR32_NOSP - GR32 registers except ESP. +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; + +// GR64_NOSP - GR64 registers except RSP (and RIP). +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)>; + +// GR64_NOREX_NOSP - GR64_NOREX registers except RSP. +def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, + (and GR64_NOREX, GR64_NOSP)>; + +// A class to support the 'A' assembler constraint: EAX then EDX. +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; + +// Scalar SSE2 floating point registers. +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; + +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; + + +// FIXME: This sets up the floating point register files as though they are f64 +// values, though they really are f80 values. This will cause us to spill +// values as 64-bit quantities instead of 80-bit quantities, which is much +// faster on common hardware. In reality, this should be controlled by a +// command line option or something. + +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; + +// Floating point stack registers (these are not allocatable by the +// register allocator - the floating point stackifier is responsible +// for transforming FPn allocations to STn registers) +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; +} + +// Generic vector registers: VR64 and VR128. +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32)>; +def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + 256, (sequence "YMM%u", 0, 15)>; + +// Status flags registers. +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} +def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h new file mode 100644 index 0000000..0333056 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Relocations.h @@ -0,0 +1,52 @@ +//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 target-specific relocation types. +// +//===----------------------------------------------------------------------===// + +#ifndef X86RELOCATIONS_H +#define X86RELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +namespace llvm { + namespace X86 { + /// RelocationType - An enum for the x86 relocation codes. Note that + /// the terminology here doesn't follow x86 convention - word means + /// 32-bit and dword means 64-bit.
The relocations are handled + /// by the JIT or ObjectCode emitters; this is transparent to the x86 code + /// emitter, but the JIT and ObjectCode emitters treat them differently. + enum RelocationType { + /// reloc_pcrel_word - PC relative relocation, add the relocated value to + /// the value already in memory, after we adjust it for where the PC is. + reloc_pcrel_word = 0, + + /// reloc_picrel_word - PIC base relative relocation, add the relocated + /// value to the value already in memory, after we adjust it for where the + /// PIC base is. + reloc_picrel_word = 1, + + /// reloc_absolute_word - absolute relocation, just add the relocated + /// value to the value already in memory. + reloc_absolute_word = 2, + + /// reloc_absolute_word_sext - absolute relocation, just add the relocated + /// value to the value already in memory. In object files, it represents a + /// value which must be sign-extended when resolving the relocation. + reloc_absolute_word_sext = 3, + + /// reloc_absolute_dword - absolute relocation, just add the relocated + /// value to the value already in memory. + reloc_absolute_dword = 4 + }; + } +} + +#endif
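Editorial aside: the add-in-place semantics described above are easiest to see in code. This is a sketch only; the function name and shape are ours (not LLVM's), and it assumes the common convention that a 4-byte PC-relative fixup is measured from the end of the field:

  #include <cstdint>
  // Sketch of a resolver for the relocation kinds above.
  static void applyX86Reloc(unsigned Kind, uint8_t *At, uint64_t Value) {
    switch (Kind) {
    case 0: // reloc_pcrel_word: rebase against the PC, then add in place.
      *reinterpret_cast<uint32_t *>(At) +=
          uint32_t(Value - reinterpret_cast<uintptr_t>(At + 4));
      break;
    case 2: // reloc_absolute_word
    case 3: // reloc_absolute_word_sext (object writers also sign-extend)
      *reinterpret_cast<uint32_t *>(At) += uint32_t(Value);
      break;
    case 4: // reloc_absolute_dword
      *reinterpret_cast<uint64_t *>(At) += Value;
      break;
    // Kind 1 (reloc_picrel_word) would rebase against the PIC base instead.
    }
  }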
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td new file mode 100644 index 0000000..84c9203 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -0,0 +1,126 @@ +//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Haswell to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def HaswellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MinLatency = 0; // 0 = Out-of-order execution. + let LoadLatency = 4; + let ILPWindow = 30; + let MispredictPenalty = 16; +} + +let SchedModel = HaswellModel in { + +// Haswell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, 6 and 7 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def HWPort0 : ProcResource<1>; +def HWPort1 : ProcResource<1>; +def HWPort2 : ProcResource<1>; +def HWPort3 : ProcResource<1>; +def HWPort4 : ProcResource<1>; +def HWPort5 : ProcResource<1>; +def HWPort6 : ProcResource<1>; +def HWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; +def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; +def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; +def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; + +// Integer division issued on port 0. +def HWDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // The register variant uses a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // The memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [HWPort4]>; + +def : WriteRes<WriteStore, [HWPort237, HWPort4]>; +def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [HWPort0156]>; +def : WriteRes<WriteZero, []>; + +defm : HWWriteResPair<WriteALU, HWPort0156, 1>; +defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +defm : HWWriteResPair<WriteShift, HWPort056, 1>; +defm : HWWriteResPair<WriteJump, HWPort5, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [HWPort15]>; + +// This is quite rough; latency depends on the dividend. +def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; +defm : HWWriteResPair<WriteFMul, HWPort0, 5>; +defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. +defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; +defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; +defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; +defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; +defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; + +// Vector integer operations. +defm : HWWriteResPair<WriteVecShift, HWPort05, 1>; +defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>; +defm : HWWriteResPair<WriteVecALU, HWPort15, 1>; +defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>; +defm : HWWriteResPair<WriteShuffle, HWPort15, 1>; + +def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } +} // SchedModel
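Editorial aside: the pair mechanics above reduce to simple arithmetic, sketched below in C++ (names ours, not LLVM API). The register form of a write occupies its execution port for Lat cycles; the load-folded form adds one port-2/3 micro-op plus Haswell's 4-cycle load latency:

  // Effective latency of the folded variant of HWWriteResPair<_, _, Lat>.
  constexpr int HWLoadLatency = 4;
  constexpr int foldedLatency(int RegLat) { return RegLat + HWLoadLatency; }
  static_assert(foldedLatency(5) == 9,
                "e.g. WriteFMulLd = WriteFMul (5) + load (4)");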
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td new file mode 100644 index 0000000..b36b3ad --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -0,0 +1,122 @@ +//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Sandy Bridge to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SandyBridgeModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SB can decode 4 + // instructions per cycle. + // FIXME: Identify instructions that aren't a single fused micro-op. + let IssueWidth = 4; + let MinLatency = 0; // 0 = Out-of-order execution. + let LoadLatency = 4; + let ILPWindow = 20; + let MispredictPenalty = 16; +} + +let SchedModel = SandyBridgeModel in { + +// Sandy Bridge can issue micro-ops to 6 different ports in one cycle. + +// Ports 0, 1, and 5 handle all computation. +def SBPort0 : ProcResource<1>; +def SBPort1 : ProcResource<1>; +def SBPort5 : ProcResource<1>; + +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. +def SBPort23 : ProcResource<2>; + +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +def SBPort4 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; +def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; +def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; + +// Integer division issued on port 0. +def SBDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // The register variant uses a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // The memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SBPort4]>; + +def : WriteRes<WriteStore, [SBPort23, SBPort4]>; +def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [SBPort015]>; +def : WriteRes<WriteZero, []>; + +defm : SBWriteResPair<WriteALU, SBPort015, 1>; +defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +defm : SBWriteResPair<WriteShift, SBPort05, 1>; +defm : SBWriteResPair<WriteJump, SBPort5, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [SBPort15]>; + +// This is quite rough; latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; +defm : SBWriteResPair<WriteFMul, SBPort0, 5>; +defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. +defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; +defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; +defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; +defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; +defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; + +// Vector integer operations. +defm : SBWriteResPair<WriteVecShift, SBPort05, 1>; +defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>; +defm : SBWriteResPair<WriteVecALU, SBPort15, 1>; +defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; +defm : SBWriteResPair<WriteShuffle, SBPort15, 1>; + +def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td new file mode 100644 index 0000000..9fbde88 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -0,0 +1,573 @@ +//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// InstrSchedModel annotations for out-of-order CPUs. +// +// These annotations are independent of the itinerary classes defined below. + +// Instructions with folded loads need to read the memory operand immediately, +// but other register operands don't have to be read until the load is ready. +// These operands are marked with ReadAfterLd. +def ReadAfterLd : SchedRead; + +// Instructions with both a load and a store folded are modeled as a folded +// load + WriteRMW. +def WriteRMW : SchedWrite; + +// Most instructions can fold loads, so almost every SchedWrite comes in two +// variants: With and without a folded load. +// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite +// with a folded load. +class X86FoldableSchedWrite : SchedWrite { + // The SchedWrite to use when a load is folded into the instruction. + SchedWrite Folded; +} + +// Multiclass that produces a linked pair of SchedWrites. +multiclass X86SchedWritePair { + // Register-Memory operation. + def Ld : SchedWrite; + // Register-Register operation. + def NAME : X86FoldableSchedWrite { + let Folded = !cast<SchedWrite>(NAME#"Ld"); + } +} + +// Arithmetic. +defm WriteALU : X86SchedWritePair; // Simple integer ALU op. +defm WriteIMul : X86SchedWritePair; // Integer multiplication. +defm WriteIDiv : X86SchedWritePair; // Integer division. +def WriteLEA : SchedWrite; // LEA instructions can't fold loads. + +// Integer shifts and rotates. +defm WriteShift : X86SchedWritePair; + +// Loads, stores, and moves, not folded with other operations. +def WriteLoad : SchedWrite; +def WriteStore : SchedWrite; +def WriteMove : SchedWrite; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def WriteZero : SchedWrite; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. 
Indirect branches can fold loads. +defm WriteJump : X86SchedWritePair; + +// Floating point. This covers both scalar and vector operations. +defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare. +defm WriteFMul : X86SchedWritePair; // Floating point multiplication. +defm WriteFDiv : X86SchedWritePair; // Floating point division. +defm WriteFSqrt : X86SchedWritePair; // Floating point square root. +defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal. +defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. + +// FMA Scheduling helper class. +class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. +defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. +defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. +defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply. + +// Vector bitwise operations. +// These are often used on both floating point and integer vectors. +defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor. +defm WriteShuffle : X86SchedWritePair; // Vector shuffles and blends. + +// Conversion between integer and float. +defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer. +defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float. +defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion. + +// Catch-all for expensive system instructions. +def WriteSystem : SchedWrite; + +// Old microcoded instructions that nobody uses. +def WriteMicrocoded : SchedWrite; + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for X86 +def IIC_ALU_MEM : InstrItinClass; +def IIC_ALU_NONMEM : InstrItinClass; +def IIC_LEA : InstrItinClass; +def IIC_LEA_16 : InstrItinClass; +def IIC_MUL8 : InstrItinClass; +def IIC_MUL16_MEM : InstrItinClass; +def IIC_MUL16_REG : InstrItinClass; +def IIC_MUL32_MEM : InstrItinClass; +def IIC_MUL32_REG : InstrItinClass; +def IIC_MUL64 : InstrItinClass; +// imul by al, ax, eax, rax +def IIC_IMUL8 : InstrItinClass; +def IIC_IMUL16_MEM : InstrItinClass; +def IIC_IMUL16_REG : InstrItinClass; +def IIC_IMUL32_MEM : InstrItinClass; +def IIC_IMUL32_REG : InstrItinClass; +def IIC_IMUL64 : InstrItinClass; +// imul reg by reg|mem +def IIC_IMUL16_RM : InstrItinClass; +def IIC_IMUL16_RR : InstrItinClass; +def IIC_IMUL32_RM : InstrItinClass; +def IIC_IMUL32_RR : InstrItinClass; +def IIC_IMUL64_RM : InstrItinClass; +def IIC_IMUL64_RR : InstrItinClass; +// imul reg = reg/mem * imm +def IIC_IMUL16_RMI : InstrItinClass; +def IIC_IMUL16_RRI : InstrItinClass; +def IIC_IMUL32_RMI : InstrItinClass; +def IIC_IMUL32_RRI : InstrItinClass; +def IIC_IMUL64_RMI : InstrItinClass; +def IIC_IMUL64_RRI : InstrItinClass; +// div +def IIC_DIV8_MEM : InstrItinClass; +def IIC_DIV8_REG : InstrItinClass; +def IIC_DIV16 : InstrItinClass; +def IIC_DIV32 : InstrItinClass; +def IIC_DIV64 : InstrItinClass; +// idiv +def IIC_IDIV8 : InstrItinClass; +def IIC_IDIV16 : InstrItinClass; +def IIC_IDIV32 : InstrItinClass; +def IIC_IDIV64 : InstrItinClass; +// neg/not/inc/dec +def IIC_UNARY_REG : InstrItinClass; +def IIC_UNARY_MEM : InstrItinClass; +// add/sub/and/or/xor/adc/sbc/cmp/test +def IIC_BIN_MEM : InstrItinClass; +def IIC_BIN_NONMEM : InstrItinClass; +// shift/rotate +def IIC_SR : InstrItinClass; +// shift double +def IIC_SHD16_REG_IM : InstrItinClass; +def IIC_SHD16_REG_CL : InstrItinClass; +def IIC_SHD16_MEM_IM : InstrItinClass; +def IIC_SHD16_MEM_CL : InstrItinClass; +def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass; +def IIC_SHD32_MEM_IM : InstrItinClass; +def IIC_SHD32_MEM_CL : InstrItinClass; +def IIC_SHD64_REG_IM : InstrItinClass; +def IIC_SHD64_REG_CL : InstrItinClass; +def IIC_SHD64_MEM_IM : InstrItinClass; +def IIC_SHD64_MEM_CL : InstrItinClass; +// cmov +def IIC_CMOV16_RM : InstrItinClass; +def IIC_CMOV16_RR : InstrItinClass; +def IIC_CMOV32_RM : InstrItinClass; +def IIC_CMOV32_RR : InstrItinClass; +def IIC_CMOV64_RM : InstrItinClass; +def IIC_CMOV64_RR : InstrItinClass; +// set +def IIC_SET_R : InstrItinClass; +def IIC_SET_M : InstrItinClass; +// jmp/jcc/jcxz +def IIC_Jcc : InstrItinClass; +def IIC_JCXZ : InstrItinClass; +def IIC_JMP_REL : InstrItinClass; +def IIC_JMP_REG : InstrItinClass; +def IIC_JMP_MEM : InstrItinClass; +def IIC_JMP_FAR_MEM : InstrItinClass; +def IIC_JMP_FAR_PTR : InstrItinClass; +// loop +def IIC_LOOP : InstrItinClass; +def IIC_LOOPE : InstrItinClass; +def IIC_LOOPNE : InstrItinClass; +// call +def IIC_CALL_RI : InstrItinClass; +def IIC_CALL_MEM : InstrItinClass; +def IIC_CALL_FAR_MEM : InstrItinClass; +def IIC_CALL_FAR_PTR : InstrItinClass; +// ret +def IIC_RET : InstrItinClass; +def IIC_RET_IMM : InstrItinClass; +//sign extension movs +def IIC_MOVSX : InstrItinClass; +def IIC_MOVSX_R16_R8 : InstrItinClass; +def IIC_MOVSX_R16_M8 : InstrItinClass; +def IIC_MOVSX_R16_R16 : InstrItinClass; +def IIC_MOVSX_R32_R32 : InstrItinClass; +//zero extension movs +def IIC_MOVZX : InstrItinClass; +def IIC_MOVZX_R16_R8 : InstrItinClass; +def IIC_MOVZX_R16_M8 : InstrItinClass; + +def IIC_REP_MOVS : InstrItinClass; +def IIC_REP_STOS : InstrItinClass; + +// SSE scalar/parallel binary operations +def IIC_SSE_ALU_F32S_RR : InstrItinClass; +def IIC_SSE_ALU_F32S_RM : InstrItinClass; +def IIC_SSE_ALU_F64S_RR : InstrItinClass; +def IIC_SSE_ALU_F64S_RM : InstrItinClass; +def IIC_SSE_MUL_F32S_RR : InstrItinClass; +def IIC_SSE_MUL_F32S_RM : InstrItinClass; +def IIC_SSE_MUL_F64S_RR : InstrItinClass; +def IIC_SSE_MUL_F64S_RM : InstrItinClass; +def IIC_SSE_DIV_F32S_RR : InstrItinClass; +def IIC_SSE_DIV_F32S_RM : InstrItinClass; +def IIC_SSE_DIV_F64S_RR : InstrItinClass; +def IIC_SSE_DIV_F64S_RM : InstrItinClass; +def IIC_SSE_ALU_F32P_RR : InstrItinClass; +def IIC_SSE_ALU_F32P_RM : InstrItinClass; +def IIC_SSE_ALU_F64P_RR : InstrItinClass; +def IIC_SSE_ALU_F64P_RM : InstrItinClass; +def IIC_SSE_MUL_F32P_RR : InstrItinClass; +def IIC_SSE_MUL_F32P_RM : InstrItinClass; +def IIC_SSE_MUL_F64P_RR : InstrItinClass; +def IIC_SSE_MUL_F64P_RM : InstrItinClass; +def IIC_SSE_DIV_F32P_RR : InstrItinClass; +def IIC_SSE_DIV_F32P_RM : InstrItinClass; +def IIC_SSE_DIV_F64P_RR : InstrItinClass; +def IIC_SSE_DIV_F64P_RM : InstrItinClass; + +def IIC_SSE_COMIS_RR : InstrItinClass; +def IIC_SSE_COMIS_RM : InstrItinClass; + +def IIC_SSE_HADDSUB_RR : InstrItinClass; +def IIC_SSE_HADDSUB_RM : InstrItinClass; + +def IIC_SSE_BIT_P_RR : InstrItinClass; +def IIC_SSE_BIT_P_RM : InstrItinClass; + +def IIC_SSE_INTALU_P_RR : InstrItinClass; +def IIC_SSE_INTALU_P_RM : InstrItinClass; +def IIC_SSE_INTALUQ_P_RR : InstrItinClass; +def IIC_SSE_INTALUQ_P_RM : InstrItinClass; + +def IIC_SSE_INTMUL_P_RR : InstrItinClass; +def IIC_SSE_INTMUL_P_RM : InstrItinClass; + +def IIC_SSE_INTSH_P_RR : InstrItinClass; +def IIC_SSE_INTSH_P_RM : InstrItinClass; +def IIC_SSE_INTSH_P_RI : InstrItinClass; + +def IIC_SSE_CMPP_RR : InstrItinClass; +def IIC_SSE_CMPP_RM : InstrItinClass; + +def IIC_SSE_SHUFP : InstrItinClass; +def IIC_SSE_PSHUF : InstrItinClass; + +def IIC_SSE_UNPCK : InstrItinClass; + +def IIC_SSE_MOVMSK : 
InstrItinClass; +def IIC_SSE_MASKMOV : InstrItinClass; + +def IIC_SSE_PEXTRW : InstrItinClass; +def IIC_SSE_PINSRW : InstrItinClass; + +def IIC_SSE_PABS_RR : InstrItinClass; +def IIC_SSE_PABS_RM : InstrItinClass; + +def IIC_SSE_SQRTP_RR : InstrItinClass; +def IIC_SSE_SQRTP_RM : InstrItinClass; +def IIC_SSE_SQRTS_RR : InstrItinClass; +def IIC_SSE_SQRTS_RM : InstrItinClass; + +def IIC_SSE_RCPP_RR : InstrItinClass; +def IIC_SSE_RCPP_RM : InstrItinClass; +def IIC_SSE_RCPS_RR : InstrItinClass; +def IIC_SSE_RCPS_RM : InstrItinClass; + +def IIC_SSE_MOV_S_RR : InstrItinClass; +def IIC_SSE_MOV_S_RM : InstrItinClass; +def IIC_SSE_MOV_S_MR : InstrItinClass; + +def IIC_SSE_MOVA_P_RR : InstrItinClass; +def IIC_SSE_MOVA_P_RM : InstrItinClass; +def IIC_SSE_MOVA_P_MR : InstrItinClass; + +def IIC_SSE_MOVU_P_RR : InstrItinClass; +def IIC_SSE_MOVU_P_RM : InstrItinClass; +def IIC_SSE_MOVU_P_MR : InstrItinClass; + +def IIC_SSE_MOVDQ : InstrItinClass; +def IIC_SSE_MOVD_ToGP : InstrItinClass; +def IIC_SSE_MOVQ_RR : InstrItinClass; + +def IIC_SSE_MOV_LH : InstrItinClass; + +def IIC_SSE_LDDQU : InstrItinClass; + +def IIC_SSE_MOVNT : InstrItinClass; + +def IIC_SSE_PHADDSUBD_RR : InstrItinClass; +def IIC_SSE_PHADDSUBD_RM : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RM : InstrItinClass; +def IIC_SSE_PHADDSUBW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBW_RM : InstrItinClass; +def IIC_SSE_PSHUFB_RR : InstrItinClass; +def IIC_SSE_PSHUFB_RM : InstrItinClass; +def IIC_SSE_PSIGN_RR : InstrItinClass; +def IIC_SSE_PSIGN_RM : InstrItinClass; + +def IIC_SSE_PMADD : InstrItinClass; +def IIC_SSE_PMULHRSW : InstrItinClass; +def IIC_SSE_PALIGNR : InstrItinClass; +def IIC_SSE_MWAIT : InstrItinClass; +def IIC_SSE_MONITOR : InstrItinClass; + +def IIC_SSE_PREFETCH : InstrItinClass; +def IIC_SSE_PAUSE : InstrItinClass; +def IIC_SSE_LFENCE : InstrItinClass; +def IIC_SSE_MFENCE : InstrItinClass; +def IIC_SSE_SFENCE : InstrItinClass; +def IIC_SSE_LDMXCSR : InstrItinClass; +def IIC_SSE_STMXCSR : InstrItinClass; + +def IIC_SSE_CVT_PD_RR : InstrItinClass; +def IIC_SSE_CVT_PD_RM : InstrItinClass; +def IIC_SSE_CVT_PS_RR : InstrItinClass; +def IIC_SSE_CVT_PS_RM : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RR : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RM : InstrItinClass; +def IIC_SSE_CVT_Scalar_RR : InstrItinClass; +def IIC_SSE_CVT_Scalar_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; + +// MMX +def IIC_MMX_MOV_MM_RM : InstrItinClass; +def IIC_MMX_MOV_REG_MM : InstrItinClass; +def IIC_MMX_MOVQ_RM : InstrItinClass; +def IIC_MMX_MOVQ_RR : InstrItinClass; + +def IIC_MMX_ALU_RM : InstrItinClass; +def IIC_MMX_ALU_RR : InstrItinClass; +def IIC_MMX_ALUQ_RM : InstrItinClass; +def IIC_MMX_ALUQ_RR : InstrItinClass; +def IIC_MMX_PHADDSUBW_RM : InstrItinClass; +def IIC_MMX_PHADDSUBW_RR : InstrItinClass; +def IIC_MMX_PHADDSUBD_RM : InstrItinClass; +def IIC_MMX_PHADDSUBD_RR : InstrItinClass; +def IIC_MMX_PMUL : InstrItinClass; +def IIC_MMX_MISC_FUNC_MEM : InstrItinClass; +def IIC_MMX_MISC_FUNC_REG : InstrItinClass; +def IIC_MMX_PSADBW : InstrItinClass; +def IIC_MMX_SHIFT_RI : InstrItinClass; +def IIC_MMX_SHIFT_RM : InstrItinClass; +def IIC_MMX_SHIFT_RR : InstrItinClass; +def IIC_MMX_UNPCK_H_RM : InstrItinClass; +def IIC_MMX_UNPCK_H_RR : InstrItinClass; +def IIC_MMX_UNPCK_L : 
InstrItinClass; +def IIC_MMX_PCK_RM : InstrItinClass; +def IIC_MMX_PCK_RR : InstrItinClass; +def IIC_MMX_PSHUF : InstrItinClass; +def IIC_MMX_PEXTR : InstrItinClass; +def IIC_MMX_PINSRW : InstrItinClass; +def IIC_MMX_MASKMOV : InstrItinClass; + +def IIC_MMX_CVT_PD_RR : InstrItinClass; +def IIC_MMX_CVT_PD_RM : InstrItinClass; +def IIC_MMX_CVT_PS_RR : InstrItinClass; +def IIC_MMX_CVT_PS_RM : InstrItinClass; + +def IIC_CMPX_LOCK : InstrItinClass; +def IIC_CMPX_LOCK_8 : InstrItinClass; +def IIC_CMPX_LOCK_8B : InstrItinClass; +def IIC_CMPX_LOCK_16B : InstrItinClass; + +def IIC_XADD_LOCK_MEM : InstrItinClass; +def IIC_XADD_LOCK_MEM8 : InstrItinClass; + +def IIC_FILD : InstrItinClass; +def IIC_FLD : InstrItinClass; +def IIC_FLD80 : InstrItinClass; +def IIC_FST : InstrItinClass; +def IIC_FST80 : InstrItinClass; +def IIC_FIST : InstrItinClass; +def IIC_FLDZ : InstrItinClass; +def IIC_FUCOM : InstrItinClass; +def IIC_FUCOMI : InstrItinClass; +def IIC_FCOMI : InstrItinClass; +def IIC_FNSTSW : InstrItinClass; +def IIC_FNSTCW : InstrItinClass; +def IIC_FLDCW : InstrItinClass; +def IIC_FNINIT : InstrItinClass; +def IIC_FFREE : InstrItinClass; +def IIC_FNCLEX : InstrItinClass; +def IIC_WAIT : InstrItinClass; +def IIC_FXAM : InstrItinClass; +def IIC_FNOP : InstrItinClass; +def IIC_FLDL : InstrItinClass; +def IIC_F2XM1 : InstrItinClass; +def IIC_FYL2X : InstrItinClass; +def IIC_FPTAN : InstrItinClass; +def IIC_FPATAN : InstrItinClass; +def IIC_FXTRACT : InstrItinClass; +def IIC_FPREM1 : InstrItinClass; +def IIC_FPSTP : InstrItinClass; +def IIC_FPREM : InstrItinClass; +def IIC_FYL2XP1 : InstrItinClass; +def IIC_FSINCOS : InstrItinClass; +def IIC_FRNDINT : InstrItinClass; +def IIC_FSCALE : InstrItinClass; +def IIC_FCOMPP : InstrItinClass; +def IIC_FXSAVE : InstrItinClass; +def IIC_FXRSTOR : InstrItinClass; + +def IIC_FXCH : InstrItinClass; + +// System instructions +def IIC_CPUID : InstrItinClass; +def IIC_INT : InstrItinClass; +def IIC_INT3 : InstrItinClass; +def IIC_INVD : InstrItinClass; +def IIC_INVLPG : InstrItinClass; +def IIC_IRET : InstrItinClass; +def IIC_HLT : InstrItinClass; +def IIC_LXS : InstrItinClass; +def IIC_LTR : InstrItinClass; +def IIC_RDTSC : InstrItinClass; +def IIC_RSM : InstrItinClass; +def IIC_SIDT : InstrItinClass; +def IIC_SGDT : InstrItinClass; +def IIC_SLDT : InstrItinClass; +def IIC_STR : InstrItinClass; +def IIC_SWAPGS : InstrItinClass; +def IIC_SYSCALL : InstrItinClass; +def IIC_SYS_ENTER_EXIT : InstrItinClass; +def IIC_IN_RR : InstrItinClass; +def IIC_IN_RI : InstrItinClass; +def IIC_OUT_RR : InstrItinClass; +def IIC_OUT_IR : InstrItinClass; +def IIC_INS : InstrItinClass; +def IIC_MOV_REG_DR : InstrItinClass; +def IIC_MOV_DR_REG : InstrItinClass; +def IIC_MOV_REG_CR : InstrItinClass; +def IIC_MOV_CR_REG : InstrItinClass; +def IIC_MOV_REG_SR : InstrItinClass; +def IIC_MOV_MEM_SR : InstrItinClass; +def IIC_MOV_SR_REG : InstrItinClass; +def IIC_MOV_SR_MEM : InstrItinClass; +def IIC_LAR_RM : InstrItinClass; +def IIC_LAR_RR : InstrItinClass; +def IIC_LSL_RM : InstrItinClass; +def IIC_LSL_RR : InstrItinClass; +def IIC_LGDT : InstrItinClass; +def IIC_LIDT : InstrItinClass; +def IIC_LLDT_REG : InstrItinClass; +def IIC_LLDT_MEM : InstrItinClass; +def IIC_PUSH_CS : InstrItinClass; +def IIC_PUSH_SR : InstrItinClass; +def IIC_POP_SR : InstrItinClass; +def IIC_POP_SR_SS : InstrItinClass; +def IIC_VERR : InstrItinClass; +def IIC_VERW_REG : InstrItinClass; +def IIC_VERW_MEM : InstrItinClass; +def IIC_WRMSR : InstrItinClass; +def IIC_RDMSR : InstrItinClass; +def IIC_RDPMC : InstrItinClass; 
+def IIC_SMSW : InstrItinClass; +def IIC_LMSW_REG : InstrItinClass; +def IIC_LMSW_MEM : InstrItinClass; +def IIC_ENTER : InstrItinClass; +def IIC_LEAVE : InstrItinClass; +def IIC_POP_MEM : InstrItinClass; +def IIC_POP_REG16 : InstrItinClass; +def IIC_POP_REG : InstrItinClass; +def IIC_POP_F : InstrItinClass; +def IIC_POP_FD : InstrItinClass; +def IIC_POP_A : InstrItinClass; +def IIC_PUSH_IMM : InstrItinClass; +def IIC_PUSH_MEM : InstrItinClass; +def IIC_PUSH_REG : InstrItinClass; +def IIC_PUSH_F : InstrItinClass; +def IIC_PUSH_A : InstrItinClass; +def IIC_BSWAP : InstrItinClass; +def IIC_BSF : InstrItinClass; +def IIC_BSR : InstrItinClass; +def IIC_MOVS : InstrItinClass; +def IIC_STOS : InstrItinClass; +def IIC_SCAS : InstrItinClass; +def IIC_CMPS : InstrItinClass; +def IIC_MOV : InstrItinClass; +def IIC_MOV_MEM : InstrItinClass; +def IIC_AHF : InstrItinClass; +def IIC_BT_MI : InstrItinClass; +def IIC_BT_MR : InstrItinClass; +def IIC_BT_RI : InstrItinClass; +def IIC_BT_RR : InstrItinClass; +def IIC_BTX_MI : InstrItinClass; +def IIC_BTX_MR : InstrItinClass; +def IIC_BTX_RI : InstrItinClass; +def IIC_BTX_RR : InstrItinClass; +def IIC_XCHG_REG : InstrItinClass; +def IIC_XCHG_MEM : InstrItinClass; +def IIC_XADD_REG : InstrItinClass; +def IIC_XADD_MEM : InstrItinClass; +def IIC_CMPXCHG_MEM : InstrItinClass; +def IIC_CMPXCHG_REG : InstrItinClass; +def IIC_CMPXCHG_MEM8 : InstrItinClass; +def IIC_CMPXCHG_REG8 : InstrItinClass; +def IIC_CMPXCHG_8B : InstrItinClass; +def IIC_CMPXCHG_16B : InstrItinClass; +def IIC_LODS : InstrItinClass; +def IIC_OUTS : InstrItinClass; +def IIC_CLC : InstrItinClass; +def IIC_CLD : InstrItinClass; +def IIC_CLI : InstrItinClass; +def IIC_CMC : InstrItinClass; +def IIC_CLTS : InstrItinClass; +def IIC_STC : InstrItinClass; +def IIC_STI : InstrItinClass; +def IIC_STD : InstrItinClass; +def IIC_XLAT : InstrItinClass; +def IIC_AAA : InstrItinClass; +def IIC_AAD : InstrItinClass; +def IIC_AAM : InstrItinClass; +def IIC_AAS : InstrItinClass; +def IIC_DAA : InstrItinClass; +def IIC_DAS : InstrItinClass; +def IIC_BOUND : InstrItinClass; +def IIC_ARPL_REG : InstrItinClass; +def IIC_ARPL_MEM : InstrItinClass; +def IIC_MOVBE : InstrItinClass; + +def IIC_NOP : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +// IssueWidth is analogous to the number of decode units. Core and its +// descendants, including Nehalem and SandyBridge, have 4 decoders. +// Resources beyond the decoder operate on micro-ops and are buffered +// so adjacent micro-ops don't directly compete. +// +// MinLatency=0 indicates that RAW dependencies can be decoded in the +// same cycle. +// +// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef +// indicates high latency opcodes. Alternatively, InstrItinData +// entries may be included here to define specific operand +// latencies. Since these latencies are not used for pipeline hazards, +// they do not need to be exact. +// +// ILPWindow=10 is an arbitrary threshold that approximates cycles of +// latency hidden by instruction buffers. The actual value is not very +// important but should be zero for inorder and nonzero for OOO processors. +// +// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel { + let IssueWidth = 4; + let MinLatency = 0; + let LoadLatency = 4; + let HighLatency = 10; + let ILPWindow = 10; +} + +include "X86ScheduleAtom.td" +include "X86SchedSandyBridge.td" +include "X86SchedHaswell.td" diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td new file mode 100644 index 0000000..cce8f1b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -0,0 +1,530 @@ +//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Intel Atom (Bonnell) +// processors. +// +//===----------------------------------------------------------------------===// + +// +// Scheduling information derived from the "Intel 64 and IA32 Architectures +// Optimization Reference Manual", Chapter 13, Section 4. +// Functional Units +// Port 0 +def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store + // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide +def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA + // SIMD/FP: SIMD ALU, FP Adder + +def AtomItineraries : ProcessorItineraries< + [ Port0, Port1 ], + [], [ + // P0 only + // InstrItinData<class, [InstrStage<N, [P0]>] >, + // P0 or P1 + // InstrItinData<class, [InstrStage<N, [P0, P1]>] >, + // P0 and P1 + // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >, + // + // Default is 1 cycle, port0 or port1 + InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >, + // mul + InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul by al, ax, eax, rax + InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul reg by reg|mem + InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >, + // imul reg = reg/mem * imm + InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RMI, [InstrStage<5,
[Port0]>] >, + InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >, + // idiv + InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >, + // div + InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >, + // neg/not/inc/dec + InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >, + // add/sub/and/or/xor/adc/sbc/cmp/test + InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >, + // shift/rotate + InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >, + // shift double + InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >, + // cmov + InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >, + // set + InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >, + // jcc + InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >, + // jcxz/jecxz/jrcxz + InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >, + // jmp rel + InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >, + // jmp indirect + InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >, + // jmp far + InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >, + // loop/loope/loopne + InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >, + // call - all but reg/imm + InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >, + //ret + InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >, + 
InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + //sign extension movs + InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >, + InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >, + //zero extension movs + InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >, + InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >, + + // SSE binary operations + // arithmetic fp scalar + InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >, + + // arithmetic fp parallel + InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >, + + // bitwise parallel + InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >, + + // arithmetic int parallel + InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >, + + // multiply int parallel + InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >, + + // shift parallel + 
InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] 
>, + InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >, + + // conversions + // to/from PD ... + InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // to/from PS except to/from PD and PS2PI + InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + + // MMX MOVs + InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + // other MMX + InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >, + // conversions + // from/to PD + InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // from/to PI + InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>, + 
InstrStage<5, [Port1]>]>,
+
+ InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, 
Port1]>] >, + InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >, + InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >, + + InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >, + InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >, + InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >, + // worst case for mov REG_CRx + InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >, + // LAR + InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >, + // LSL + InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >, + + InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >, + // push control register, segment registers + InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >, + // pop control register, segment registers + InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >, + InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >, + // VERR, VERW + InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >, + // WRMSR, RDMSR + InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >, + InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >, + InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >, + // SMSW, LMSW + InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >, + + InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >, + InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_F, [InstrStage<9, 
[Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+ ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MinLatency = 1; // InstrStage cycles override MinLatency.
+ // OperandCycles may be used for expected latency.
+ let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
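+ // For example, the entry
+ //   InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >
+ // above makes a 64-bit multiply occupy either port for 12 cycles; with
+ // MinLatency = 1, those InstrStage cycles are the cost the scheduler
+ // actually uses.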
+ let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+ let ILPWindow = 0; // Always try to hide expected latency.
+
+ let Itineraries = AtomItineraries;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 0000000..f934fdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,270 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
+ TargetSelectionDAGInfo(TM),
+ Subtarget(&TM.getSubtarget<X86Subtarget>()),
+ TLI(*TM.getTargetLowering()) {
+}
+
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ // If to a segment-relative address space, use the default lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // If not DWORD aligned or size is more than the threshold, call the library.
+ // The libc version is likely to be faster for these cases. It can use the
+ // address value and run time information about the CPU.
+ if ((Align & 3) != 0 ||
+ !ConstantSize ||
+ ConstantSize->getZExtValue() >
+ Subtarget->getMaxInlineSizeThreshold()) {
+ SDValue InFlag(0, 0);
+
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+ if (const char *bzeroEntry = V &&
+ V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+ EVT IntPtr = TLI.getPointerTy();
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
+ false, false, false, false,
+ 0, CallingConv::C, /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
+ DAG, dl);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag(0, 0);
+ EVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+ // If the value is a constant, then we can potentially use larger sets.
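+ // For example, with Val = 0xAB: WORD alignment replicates it to 0xABAB,
+ // DWORD alignment to 0xABABABAB, and (for 8-byte-aligned buffers on
+ // 64-bit targets) QWORD to 0xABABABABABABABAB, so each REP_STOS
+ // iteration stores one AVT-sized unit of the pattern.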
+ switch (Align & 3) { + case 2: // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + ValReg = X86::EAX; + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + AVT = MVT::i64; + ValReg = X86::RAX; + Val = (Val << 32) | Val; + } + break; + default: // Byte aligned + AVT = MVT::i8; + ValReg = X86::AL; + Count = DAG.getIntPtrConstant(SizeVal); + break; + } + + if (AVT.bitsGT(MVT::i8)) { + unsigned UBytes = AVT.getSizeInBits() / 8; + Count = DAG.getIntPtrConstant(SizeVal / UBytes); + BytesLeft = SizeVal % UBytes; + } + + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), + InFlag); + InFlag = Chain.getValue(1); + } else { + AVT = MVT::i8; + Count = DAG.getIntPtrConstant(SizeVal); + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); + InFlag = Chain.getValue(1); + } + + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + + if (TwoRepStos) { + InFlag = Chain.getValue(1); + Count = Size; + EVT CVT = Count.getValueType(); + SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : + X86::ECX, + Left, InFlag); + InFlag = Chain.getValue(1); + Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + } else if (BytesLeft) { + // Handle the last 1 - 7 bytes. + unsigned Offset = SizeVal - BytesLeft; + EVT AddrVT = Dst.getValueType(); + EVT SizeVT = Size.getValueType(); + + Chain = DAG.getMemset(Chain, dl, + DAG.getNode(ISD::ADD, dl, AddrVT, Dst, + DAG.getConstant(Offset, AddrVT)), + Src, + DAG.getConstant(BytesLeft, SizeVT), + Align, isVolatile, DstPtrInfo.getWithOffset(Offset)); + } + + // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. + return Chain; +} + +SDValue +X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { + // This requires the copy size to be a constant, preferably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return SDValue(); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + return SDValue(); + + /// If not DWORD aligned, it is more efficient to call the library. However + /// if calling the library is not allowed (AlwaysInline), then soldier on as + /// the code generated here is better than the long load-store sequence we + /// would otherwise get. + if (!AlwaysInline && (Align & 3) != 0) + return SDValue(); + + // If to a segment-relative address space, use the default lowering. 
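+ // (By LLVM's x86 convention, address spaces 256 and 257 denote the GS and
+ // FS segments. REP MOVS implicitly addresses through DS:(E)SI/ES:(E)DI,
+ // so a segment-relative copy cannot use it.)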
+ if (DstPtrInfo.getAddrSpace() >= 256 || + SrcPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + // ESI might be used as a base pointer, in that case we can't simply overwrite + // the register. Fall back to generic code. + const X86RegisterInfo *TRI = + static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo()); + if (TRI->hasBasePointer(DAG.getMachineFunction()) && + TRI->getBaseRegister() == X86::ESI) + return SDValue(); + + MVT AVT; + if (Align & 1) + AVT = MVT::i8; + else if (Align & 2) + AVT = MVT::i16; + else if (Align & 4) + // DWORD aligned + AVT = MVT::i32; + else + // QWORD aligned + AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + + unsigned UBytes = AVT.getSizeInBits() / 8; + unsigned CountVal = SizeVal / UBytes; + SDValue Count = DAG.getIntPtrConstant(CountVal); + unsigned BytesLeft = SizeVal % UBytes; + + SDValue InFlag(0, 0); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : + X86::ESI, + Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, + array_lengthof(Ops)); + + SmallVector<SDValue, 4> Results; + Results.push_back(RepMovs); + if (BytesLeft) { + // Handle the last 1 - 7 bytes. + unsigned Offset = SizeVal - BytesLeft; + EVT DstVT = Dst.getValueType(); + EVT SrcVT = Src.getValueType(); + EVT SizeVT = Size.getValueType(); + Results.push_back(DAG.getMemcpy(Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, + DAG.getConstant(Offset, DstVT)), + DAG.getNode(ISD::ADD, dl, SrcVT, Src, + DAG.getConstant(Offset, SrcVT)), + DAG.getConstant(BytesLeft, SizeVT), + Align, isVolatile, AlwaysInline, + DstPtrInfo.getWithOffset(Offset), + SrcPtrInfo.getWithOffset(Offset))); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &Results[0], Results.size()); +} diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h new file mode 100644 index 0000000..d1d66fe --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -0,0 +1,56 @@ +//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef X86SELECTIONDAGINFO_H +#define X86SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class X86TargetLowering; +class X86TargetMachine; +class X86Subtarget; + +class X86SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const X86Subtarget *Subtarget; + + const X86TargetLowering &TLI; + +public: + explicit X86SelectionDAGInfo(const X86TargetMachine &TM); + ~X86SelectionDAGInfo(); + + virtual + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const; + + virtual + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp new file mode 100644 index 0000000..74da2a9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -0,0 +1,518 @@ +//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "subtarget" +#include "X86Subtarget.h" +#include "X86InstrInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "X86GenSubtargetInfo.inc" + +using namespace llvm; + +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +/// ClassifyBlockAddressReference - Classify a blockaddress reference for the +/// current subtarget according to how we should reference it in a non-pcrel +/// context. +unsigned char X86Subtarget::ClassifyBlockAddressReference() const { + if (isPICStyleGOT()) // 32-bit ELF targets. + return X86II::MO_GOTOFF; + + if (isPICStyleStubPIC()) // Darwin/32 in PIC mode. + return X86II::MO_PIC_BASE_OFFSET; + + // Direct static reference to label. + return X86II::MO_NO_FLAG; +} + +/// ClassifyGlobalReference - Classify a global variable reference for the +/// current subtarget according to how we should reference it in a non-pcrel +/// context. +unsigned char X86Subtarget:: +ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { + // DLLImport only exists on windows, it is implemented as a load from a + // DLLIMPORT stub. + if (GV->hasDLLImportLinkage()) + return X86II::MO_DLLIMPORT; + + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + // X86-64 in PIC mode. + if (isPICStyleRIPRel()) { + // Large model never uses stubs. 
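+ // (The large code model materializes addresses as full 64-bit
+ // immediates, so no GOT- or stub-based indirection applies.)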
+ if (TM.getCodeModel() == CodeModel::Large)
+ return X86II::MO_NO_FLAG;
+
+ if (isTargetDarwin()) {
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ if (GV->hasDefaultVisibility() &&
+ (isDecl || GV->isWeakForLinker()))
+ return X86II::MO_GOTPCREL;
+ } else if (!isTargetWin64()) {
+ assert(isTargetELF() && "Unknown rip-relative target");
+
+ // Extra load is needed for all externally visible.
+ if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+ return X86II::MO_GOTPCREL;
+ }
+
+ return X86II::MO_NO_FLAG;
+ }
+
+ if (isPICStyleGOT()) { // 32-bit ELF targets.
+ // Extra load is needed for all externally visible.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return X86II::MO_GOTOFF;
+ return X86II::MO_GOT;
+ }
+
+ if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
+ // Determine whether we have a stub reference and/or whether the reference
+ // is relative to the PIC base or not.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (isDecl || GV->hasCommonLinkage()) {
+ // Hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+ }
+
+ // Otherwise, no stub.
+ return X86II::MO_PIC_BASE_OFFSET;
+ }
+
+ if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
+ // Determine whether we have a stub reference.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_NO_FLAG;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY;
+
+ // Otherwise, no stub.
+ return X86II::MO_NO_FLAG;
+ }
+
+ // Direct static reference to global.
+ return X86II::MO_NO_FLAG;
+}
+
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+ // Darwin 10 has a __bzero entry point for this purpose.
+ if (getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 6))
+ return "__bzero";
+
+ return 0;
+}
+
+bool X86Subtarget::hasSinCos() const {
+ return getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+ is64Bit();
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address.
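+/// (A direct call encodes only a 32-bit immediate operand, which is why the
+/// implementation below always answers false in 64-bit mode.)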
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { + if (In64BitMode) + return false; + return isTargetELF() || TM.getRelocationModel() == Reloc::Static; +} + +static bool OSHasAVXSupport() { +#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ + || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) +#if defined(__GNUC__) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + int rEAX, rEDX; + __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) + unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +#else + int rEAX = 0; // Ensures we return false +#endif + return (rEAX & 6) == 6; +#else + return false; +#endif +} + +void X86Subtarget::AutoDetectSubtargetFeatures() { + unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + unsigned MaxLevel; + union { + unsigned u[3]; + char c[12]; + } text; + + if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) || + MaxLevel < 1) + return; + + X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX); + + if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } + if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); } + if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); } + if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); } + if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); } + if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} + if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} + if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} + if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) { + X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); + } + + bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; + bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; + + if ((ECX >> 1) & 0x1) { + HasPCLMUL = true; + ToggleFeature(X86::FeaturePCLMUL); + } + if ((ECX >> 12) & 0x1) { + HasFMA = true; + ToggleFeature(X86::FeatureFMA); + } + if (IsIntel && ((ECX >> 22) & 0x1)) { + HasMOVBE = true; + ToggleFeature(X86::FeatureMOVBE); + } + if ((ECX >> 23) & 0x1) { + HasPOPCNT = true; + ToggleFeature(X86::FeaturePOPCNT); + } + if ((ECX >> 25) & 0x1) { + HasAES = true; + ToggleFeature(X86::FeatureAES); + } + if ((ECX >> 29) & 0x1) { + HasF16C = true; + ToggleFeature(X86::FeatureF16C); + } + if (IsIntel && ((ECX >> 30) & 0x1)) { + HasRDRAND = true; + ToggleFeature(X86::FeatureRDRAND); + } + + if ((ECX >> 13) & 0x1) { + HasCmpxchg16b = true; + ToggleFeature(X86::FeatureCMPXCHG16B); + } + + if (IsIntel || IsAMD) { + // Determine if bit test memory instructions are slow. + unsigned Family = 0; + unsigned Model = 0; + X86_MC::DetectFamilyModel(EAX, Family, Model); + if (IsAMD || (Family == 6 && Model >= 13)) { + IsBTMemSlow = true; + ToggleFeature(X86::FeatureSlowBTMem); + } + + // If it's an Intel chip since Nehalem and not an Atom chip, unaligned + // memory access is fast. We hard code model numbers here because they + // aren't strictly increasing for Intel chips it seems. 
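+ // (The Atom models, 28/38/39/53/54, recognized separately below, are
+ // intentionally absent from this list.)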
+ if (IsIntel &&
+ ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield,
+ // Jasper Forest
+ (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP
+ (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX
+ (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale
+ (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP
+ (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
+ (Family == 6 && Model == 0x2A) || // SandyBridge
+ (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
+ (Family == 6 && Model == 0x3A))) {// IvyBridge
+ IsUAMemFast = true;
+ ToggleFeature(X86::FeatureFastUAMem);
+ }
+
+ // Set processor type. Currently only Atom is detected.
+ if (Family == 6 &&
+ (Model == 28 || Model == 38 || Model == 39
+ || Model == 53 || Model == 54)) {
+ X86ProcFamily = IntelAtom;
+
+ UseLeaForSP = true;
+ ToggleFeature(X86::FeatureLeaForSP);
+ }
+
+ unsigned MaxExtLevel;
+ X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
+
+ if (MaxExtLevel >= 0x80000001) {
+ X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ if ((EDX >> 29) & 0x1) {
+ HasX86_64 = true;
+ ToggleFeature(X86::Feature64Bit);
+ }
+ if ((ECX >> 5) & 0x1) {
+ HasLZCNT = true;
+ ToggleFeature(X86::FeatureLZCNT);
+ }
+ if (IsIntel && ((ECX >> 8) & 0x1)) {
+ HasPRFCHW = true;
+ ToggleFeature(X86::FeaturePRFCHW);
+ }
+ if (IsAMD) {
+ if ((ECX >> 6) & 0x1) {
+ HasSSE4A = true;
+ ToggleFeature(X86::FeatureSSE4A);
+ }
+ if ((ECX >> 11) & 0x1) {
+ HasXOP = true;
+ ToggleFeature(X86::FeatureXOP);
+ }
+ if ((ECX >> 16) & 0x1) {
+ HasFMA4 = true;
+ ToggleFeature(X86::FeatureFMA4);
+ }
+ }
+ }
+ }
+
+ if (MaxLevel >= 7) {
+ if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
+ if (IsIntel && (EBX & 0x1)) {
+ HasFSGSBase = true;
+ ToggleFeature(X86::FeatureFSGSBase);
+ }
+ if ((EBX >> 3) & 0x1) {
+ HasBMI = true;
+ ToggleFeature(X86::FeatureBMI);
+ }
+ if ((EBX >> 4) & 0x1) {
+ HasHLE = true;
+ ToggleFeature(X86::FeatureHLE);
+ }
+ if (IsIntel && ((EBX >> 5) & 0x1)) {
+ X86SSELevel = AVX2;
+ ToggleFeature(X86::FeatureAVX2);
+ }
+ if (IsIntel && ((EBX >> 8) & 0x1)) {
+ HasBMI2 = true;
+ ToggleFeature(X86::FeatureBMI2);
+ }
+ if (IsIntel && ((EBX >> 11) & 0x1)) {
+ HasRTM = true;
+ ToggleFeature(X86::FeatureRTM);
+ }
+ if (IsIntel && ((EBX >> 19) & 0x1)) {
+ HasADX = true;
+ ToggleFeature(X86::FeatureADX);
+ }
+ if (IsIntel && ((EBX >> 18) & 0x1)) {
+ HasRDSEED = true;
+ ToggleFeature(X86::FeatureRDSEED);
+ }
+ }
+ }
+}
+
+void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+ AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+ Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "target-cpu");
+ Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "target-features");
+ std::string CPU =
+ !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
+ std::string FS =
+ !FSAttr.hasAttribute(Attribute::None) ? 
FSAttr.getValueAsString() : ""; + if (!FS.empty()) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + } +} + +void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { + std::string CPUName = CPU; + if (!FS.empty() || !CPU.empty()) { + if (CPUName.empty()) { +#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ + || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) + CPUName = sys::getHostCPUName(); +#else + CPUName = "generic"; +#endif + } + + // Make sure 64-bit features are available in 64-bit mode. (But make sure + // SSE2 can be turned off explicitly.) + std::string FullFS = FS; + if (In64BitMode) { + if (!FullFS.empty()) + FullFS = "+64bit,+sse2," + FullFS; + else + FullFS = "+64bit,+sse2"; + } + + // If feature string is not empty, parse features string. + ParseSubtargetFeatures(CPUName, FullFS); + } else { + if (CPUName.empty()) { +#if defined (__x86_64__) || defined(__i386__) + CPUName = sys::getHostCPUName(); +#else + CPUName = "generic"; +#endif + } + // Otherwise, use CPUID to auto-detect feature set. + AutoDetectSubtargetFeatures(); + + // Make sure 64-bit features are available in 64-bit mode. + if (In64BitMode) { + HasX86_64 = true; ToggleFeature(X86::Feature64Bit); + HasCMov = true; ToggleFeature(X86::FeatureCMOV); + + if (X86SSELevel < SSE2) { + X86SSELevel = SSE2; + ToggleFeature(X86::FeatureSSE1); + ToggleFeature(X86::FeatureSSE2); + } + } + } + + // CPUName may have been set by the CPU detection code. Make sure the + // new MCSchedModel is used. + InitMCProcessorInfo(CPUName, FS); + + if (X86ProcFamily == IntelAtom) + PostRAScheduler = true; + + InstrItins = getInstrItineraryForCPU(CPUName); + + // It's important to keep the MCSubtargetInfo feature bits in sync with + // target data structure which is shared with MC code emitter, etc. + if (In64BitMode) + ToggleFeature(X86::Mode64Bit); + + DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel + << ", 3DNowLevel " << X863DNowLevel + << ", 64bit " << HasX86_64 << "\n"); + assert((!In64BitMode || HasX86_64) && + "64-bit code requested on a subtarget that doesn't support it!"); + + // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both + // 32 and 64 bit) and for all 64-bit targets. + if (StackAlignOverride) + stackAlignment = StackAlignOverride; + else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || + In64BitMode) + stackAlignment = 16; +} + +void X86Subtarget::initializeEnvironment() { + X86SSELevel = NoMMXSSE; + X863DNowLevel = NoThreeDNow; + HasCMov = false; + HasX86_64 = false; + HasPOPCNT = false; + HasSSE4A = false; + HasAES = false; + HasPCLMUL = false; + HasFMA = false; + HasFMA4 = false; + HasXOP = false; + HasMOVBE = false; + HasRDRAND = false; + HasF16C = false; + HasFSGSBase = false; + HasLZCNT = false; + HasBMI = false; + HasBMI2 = false; + HasRTM = false; + HasHLE = false; + HasADX = false; + HasPRFCHW = false; + HasRDSEED = false; + IsBTMemSlow = false; + IsUAMemFast = false; + HasVectorUAMem = false; + HasCmpxchg16b = false; + UseLeaForSP = false; + HasSlowDivide = false; + PostRAScheduler = false; + PadShortFunctions = false; + CallRegIndirect = false; + LEAUsesAG = false; + stackAlignment = 4; + // FIXME: this is a known good value for Yonah. How about others? 
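+ // (This is the bound that EmitTargetCodeForMemset/Memcpy compare a
+ // constant size against: fixed sizes at or below it are expanded inline,
+ // anything larger calls the library routine.)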
+ MaxInlineSizeThreshold = 128; +} + +X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, + unsigned StackAlignOverride, bool is64Bit) + : X86GenSubtargetInfo(TT, CPU, FS) + , X86ProcFamily(Others) + , PICStyle(PICStyles::None) + , TargetTriple(TT) + , StackAlignOverride(StackAlignOverride) + , In64BitMode(is64Bit) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); +} + +bool X86Subtarget::enablePostRAScheduler( + CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + return PostRAScheduler && OptLevel >= CodeGenOpt::Default; +} diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h new file mode 100644 index 0000000..66832b9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -0,0 +1,383 @@ +//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef X86SUBTARGET_H +#define X86SUBTARGET_H + +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "X86GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; +class TargetMachine; + +/// PICStyles - The X86 backend supports a number of different styles of PIC. +/// +namespace PICStyles { +enum Style { + StubPIC, // Used on i386-darwin in -fPIC mode. + StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode. + GOT, // Used on many 32-bit unices in -fPIC mode. + RIPRel, // Used on X86-64 when not in -static mode. + None // Set when in -static mode (not PIC or DynamicNoPIC mode). +}; +} + +class X86Subtarget : public X86GenSubtargetInfo { +protected: + enum X86SSEEnum { + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 + }; + + enum X863DNowEnum { + NoThreeDNow, ThreeDNow, ThreeDNowA + }; + + enum X86ProcFamilyEnum { + Others, IntelAtom + }; + + /// X86ProcFamily - X86 processor family: Intel Atom, and others + X86ProcFamilyEnum X86ProcFamily; + + /// PICStyle - Which PIC style to use + /// + PICStyles::Style PICStyle; + + /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or + /// none supported. + X86SSEEnum X86SSELevel; + + /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported. + /// + X863DNowEnum X863DNowLevel; + + /// HasCMov - True if this processor has conditional move instructions + /// (generally pentium pro+). + bool HasCMov; + + /// HasX86_64 - True if the processor supports X86-64 instructions. + /// + bool HasX86_64; + + /// HasPOPCNT - True if the processor supports POPCNT. + bool HasPOPCNT; + + /// HasSSE4A - True if the processor supports SSE4A instructions. 
+ bool HasSSE4A;
+
+ /// HasAES - Target has AES instructions
+ bool HasAES;
+
+ /// HasPCLMUL - Target has carry-less multiplication
+ bool HasPCLMUL;
+
+ /// HasFMA - Target has 3-operand fused multiply-add
+ bool HasFMA;
+
+ /// HasFMA4 - Target has 4-operand fused multiply-add
+ bool HasFMA4;
+
+ /// HasXOP - Target has XOP instructions
+ bool HasXOP;
+
+ /// HasMOVBE - True if the processor has the MOVBE instruction.
+ bool HasMOVBE;
+
+ /// HasRDRAND - True if the processor has the RDRAND instruction.
+ bool HasRDRAND;
+
+ /// HasF16C - Processor has 16-bit floating point conversion instructions.
+ bool HasF16C;
+
+ /// HasFSGSBase - Processor has FS/GS base instructions.
+ bool HasFSGSBase;
+
+ /// HasLZCNT - Processor has LZCNT instruction.
+ bool HasLZCNT;
+
+ /// HasBMI - Processor has BMI1 instructions.
+ bool HasBMI;
+
+ /// HasBMI2 - Processor has BMI2 instructions.
+ bool HasBMI2;
+
+ /// HasRTM - Processor has RTM instructions.
+ bool HasRTM;
+
+ /// HasHLE - Processor has HLE.
+ bool HasHLE;
+
+ /// HasADX - Processor has ADX instructions.
+ bool HasADX;
+
+ /// HasPRFCHW - Processor has PRFCHW instructions.
+ bool HasPRFCHW;
+
+ /// HasRDSEED - Processor has RDSEED instructions.
+ bool HasRDSEED;
+
+ /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+ bool IsBTMemSlow;
+
+ /// IsUAMemFast - True if unaligned memory access is fast.
+ bool IsUAMemFast;
+
+ /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+ /// operands. This may require setting a feature bit in the processor.
+ bool HasVectorUAMem;
+
+ /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+ /// this is true for most x86-64 chips, but not the first AMD chips.
+ bool HasCmpxchg16b;
+
+ /// UseLeaForSP - True if the LEA instruction should be used for adjusting
+ /// the stack pointer. This is an optimization for Intel Atom processors.
+ bool UseLeaForSP;
+
+ /// HasSlowDivide - True if smaller divides are significantly faster than
+ /// full divides and should be used when possible.
+ bool HasSlowDivide;
+
+ /// PostRAScheduler - True if using post-register-allocation scheduler.
+ bool PostRAScheduler;
+
+ /// PadShortFunctions - True if short functions should be padded to prevent
+ /// a stall when returning too early.
+ bool PadShortFunctions;
+
+ /// CallRegIndirect - True if calls with a memory reference should be
+ /// converted to a register-based indirect call.
+ bool CallRegIndirect;
+ /// LEAUsesAG - True if the LEA instruction inputs have to be ready at
+ /// address generation (AG) time.
+ bool LEAUsesAG;
+
+ /// stackAlignment - The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ unsigned MaxInlineSizeThreshold;
+
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// Instruction itineraries for scheduling
+ InstrItineraryData InstrItins;
+
+private:
+ /// StackAlignOverride - Override the stack alignment.
+ unsigned StackAlignOverride;
+
+ /// In64BitMode - True if compiling for 64-bit, false for 32-bit.
+ bool In64BitMode;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ /// + X86Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, + unsigned StackAlignOverride, bool is64Bit); + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID + /// instruction. + void AutoDetectSubtargetFeatures(); + + /// \brief Reset the features for the X86 target. + virtual void resetSubtargetFeatures(const MachineFunction *MF); +private: + void initializeEnvironment(); + void resetSubtargetFeatures(StringRef CPU, StringRef FS); +public: + /// Is this x86_64? (disregarding specific ABI / programming model) + bool is64Bit() const { + return In64BitMode; + } + + /// Is this x86_64 with the ILP32 programming model (x32 ABI)? + bool isTarget64BitILP32() const { + return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32); + } + + /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? + bool isTarget64BitLP64() const { + return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32); + } + + PICStyles::Style getPICStyle() const { return PICStyle; } + void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } + + bool hasCMov() const { return HasCMov; } + bool hasMMX() const { return X86SSELevel >= MMX; } + bool hasSSE1() const { return X86SSELevel >= SSE1; } + bool hasSSE2() const { return X86SSELevel >= SSE2; } + bool hasSSE3() const { return X86SSELevel >= SSE3; } + bool hasSSSE3() const { return X86SSELevel >= SSSE3; } + bool hasSSE41() const { return X86SSELevel >= SSE41; } + bool hasSSE42() const { return X86SSELevel >= SSE42; } + bool hasAVX() const { return X86SSELevel >= AVX; } + bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasFp256() const { return hasAVX(); } + bool hasInt256() const { return hasAVX2(); } + bool hasSSE4A() const { return HasSSE4A; } + bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } + bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } + bool hasPOPCNT() const { return HasPOPCNT; } + bool hasAES() const { return HasAES; } + bool hasPCLMUL() const { return HasPCLMUL; } + bool hasFMA() const { return HasFMA; } + // FIXME: Favor FMA when both are enabled. Is this the right thing to do? 
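+ // (As written, hasFMA4() reports false whenever plain FMA is also
+ // available, so the three-operand forms win when both bits are set.)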
+ bool hasFMA4() const { return HasFMA4 && !HasFMA; } + bool hasXOP() const { return HasXOP; } + bool hasMOVBE() const { return HasMOVBE; } + bool hasRDRAND() const { return HasRDRAND; } + bool hasF16C() const { return HasF16C; } + bool hasFSGSBase() const { return HasFSGSBase; } + bool hasLZCNT() const { return HasLZCNT; } + bool hasBMI() const { return HasBMI; } + bool hasBMI2() const { return HasBMI2; } + bool hasRTM() const { return HasRTM; } + bool hasHLE() const { return HasHLE; } + bool hasADX() const { return HasADX; } + bool hasPRFCHW() const { return HasPRFCHW; } + bool hasRDSEED() const { return HasRDSEED; } + bool isBTMemSlow() const { return IsBTMemSlow; } + bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool hasVectorUAMem() const { return HasVectorUAMem; } + bool hasCmpxchg16b() const { return HasCmpxchg16b; } + bool useLeaForSP() const { return UseLeaForSP; } + bool hasSlowDivide() const { return HasSlowDivide; } + bool padShortFunctions() const { return PadShortFunctions; } + bool callRegIndirect() const { return CallRegIndirect; } + bool LEAusesAG() const { return LEAUsesAG; } + + bool isAtom() const { return X86ProcFamily == IntelAtom; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { + return TargetTriple.getOS() == Triple::FreeBSD; + } + bool isTargetSolaris() const { + return TargetTriple.getOS() == Triple::Solaris; + } + bool isTargetELF() const { + return (TargetTriple.getEnvironment() == Triple::ELF || + TargetTriple.isOSBinFormatELF()); + } + bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } + bool isTargetNaCl() const { + return TargetTriple.getOS() == Triple::NaCl; + } + bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } + bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; } + bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; } + bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; } + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } + bool isTargetCOFF() const { + return (TargetTriple.getEnvironment() != Triple::ELF && + TargetTriple.isOSBinFormatCOFF()); + } + bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); } + + bool isTargetWin64() const { + // FIXME: x86_64-cygwin has not been released yet. + return In64BitMode && TargetTriple.isOSWindows(); + } + + bool isTargetWin32() const { + // FIXME: Cygwin is included for isTargetWin64 -- should it be included + // here too? + return !In64BitMode && (isTargetMingw() || isTargetWindows()); + } + + bool isPICStyleSet() const { return PICStyle != PICStyles::None; } + bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; } + bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; } + + bool isPICStyleStubPIC() const { + return PICStyle == PICStyles::StubPIC; + } + + bool isPICStyleStubNoDynamic() const { + return PICStyle == PICStyles::StubDynamicNoPIC; + } + bool isPICStyleStubAny() const { + return PICStyle == PICStyles::StubDynamicNoPIC || + PICStyle == PICStyles::StubPIC; } + + /// ClassifyGlobalReference - Classify a global variable reference for the + /// current subtarget according to how we should reference it in a non-pcrel + /// context. 
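+  /// A sketch of a typical call site (illustrative only); the result is a
+  /// target operand flag such as X86II::MO_GOT or X86II::MO_NO_FLAG that is
+  /// attached to the operand referencing GV:
+  ///   unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);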
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const;
+
+  /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyBlockAddressReference() const;
+
+  /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+  /// to an immediate address.
+  bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered preferable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+
+  /// This function returns true if the target has a sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
+
+  /// enablePostRAScheduler - Enable the post-RA scheduler; used for Atom
+  /// optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  bool postRAScheduler() const { return PostRAScheduler; }
+
+  /// getInstrItineraryData - Return the instruction itineraries based on the
+  /// subtarget selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..00fa47f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,232 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeX86Target() {
+  // Register the target.
+  RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
+  RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
+}
+
+void X86_32TargetMachine::anchor() { }
+
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Reloc::Model RM, CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
+    DL(getSubtargetImpl()->isTargetDarwin() ?
+       "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
+       "n8:16:32-S128" :
+       (getSubtargetImpl()->isTargetCygMing() ||
+        getSubtargetImpl()->isTargetWindows()) ?
+ "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-" + "n8:16:32-S32" : + "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-" + "n8:16:32-S128"), + InstrInfo(*this), + TLInfo(*this), + TSInfo(*this), + JITInfo(*this) { +} + +void X86_64TargetMachine::anchor() { } + +X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true), + // The x32 ABI dictates the ILP32 programming model for x64. + DL(getSubtargetImpl()->isTarget64BitILP32() ? + "e-p:32:32-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + "n8:16:32:64-S128" : + "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + "n8:16:32:64-S128"), + InstrInfo(*this), + TLInfo(*this), + TSInfo(*this), + JITInfo(*this) { +} + +/// X86TargetMachine ctor - Create an X86 target. +/// +X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, + bool is64Bit) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit), + FrameLowering(*this, Subtarget), + InstrItins(Subtarget.getInstrItineraryData()){ + // Determine the PICStyle based on the target selected. + if (getRelocationModel() == Reloc::Static) { + // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. + Subtarget.setPICStyle(PICStyles::None); + } else if (Subtarget.is64Bit()) { + // PIC in 64 bit mode is always rip-rel. + Subtarget.setPICStyle(PICStyles::RIPRel); + } else if (Subtarget.isTargetCygMing()) { + Subtarget.setPICStyle(PICStyles::None); + } else if (Subtarget.isTargetDarwin()) { + if (getRelocationModel() == Reloc::PIC_) + Subtarget.setPICStyle(PICStyles::StubPIC); + else { + assert(getRelocationModel() == Reloc::DynamicNoPIC); + Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC); + } + } else if (Subtarget.isTargetELF()) { + Subtarget.setPICStyle(PICStyles::GOT); + } + + // default to hard float ABI + if (Options.FloatABIType == FloatABI::Default) + this->Options.FloatABIType = FloatABI::Hard; +} + +//===----------------------------------------------------------------------===// +// Command line options for x86 +//===----------------------------------------------------------------------===// +static cl::opt<bool> +UseVZeroUpper("x86-use-vzeroupper", + cl::desc("Minimize AVX to SSE transition penalty"), + cl::init(true)); + +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt<bool> +X86EarlyIfConv("x86-early-ifcvt", + cl::desc("Enable early if-conversion on X86")); + +//===----------------------------------------------------------------------===// +// X86 Analysis Pass Setup +//===----------------------------------------------------------------------===// + +void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our X86 pass. This + // allows the X86 pass to delegate to the target independent layer when + // appropriate. 
+  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createX86TargetTransformInfoPass(this));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+  X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  X86TargetMachine &getX86TargetMachine() const {
+    return getTM<X86TargetMachine>();
+  }
+
+  const X86Subtarget &getX86Subtarget() const {
+    return *getX86TargetMachine().getSubtargetImpl();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addILPOpts();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new X86PassConfig(this, PM);
+}
+
+bool X86PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+  // For ELF, clean up any local-dynamic TLS accesses.
+  if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+    addPass(createCleanupLocalDynamicTLSPass());
+
+  // For 32-bit, prepend instructions to set the "global base reg" for PIC.
+  if (!getX86Subtarget().is64Bit())
+    addPass(createGlobalBaseRegPass());
+
+  return false;
+}
+
+bool X86PassConfig::addILPOpts() {
+  if (X86EarlyIfConv && getX86Subtarget().hasCMov()) {
+    addPass(&EarlyIfConverterID);
+    return true;
+  }
+  return false;
+}
+
+bool X86PassConfig::addPreRegAlloc() {
+  return false; // -print-machineinstr shouldn't print after this.
+}
+
+bool X86PassConfig::addPostRegAlloc() {
+  addPass(createX86FloatingPointStackifierPass());
+  return true; // -print-machineinstr should print after this.
+}
+
+bool X86PassConfig::addPreEmitPass() {
+  bool ShouldPrint = false;
+  if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
+    addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+    ShouldPrint = true;
+  }
+
+  if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
+    addPass(createX86IssueVZeroUpperPass());
+    ShouldPrint = true;
+  }
+
+  if (getOptLevel() != CodeGenOpt::None &&
+      getX86Subtarget().padShortFunctions()) {
+    addPass(createX86PadShortFunctions());
+    ShouldPrint = true;
+  }
+  if (getOptLevel() != CodeGenOpt::None &&
+      getX86Subtarget().LEAusesAG()) {
+    addPass(createX86FixupLEAs());
+    ShouldPrint = true;
+  }
+
+  return ShouldPrint;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      JITCodeEmitter &JCE) {
+  PM.add(createX86JITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..174d391
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,137 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+// +//===----------------------------------------------------------------------===// + +#ifndef X86TARGETMACHINE_H +#define X86TARGETMACHINE_H + +#include "X86.h" +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86SelectionDAGInfo.h" +#include "X86Subtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class StringRef; + +class X86TargetMachine : public LLVMTargetMachine { + X86Subtarget Subtarget; + X86FrameLowering FrameLowering; + InstrItineraryData InstrItins; + +public: + X86TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, + bool is64Bit); + + virtual const X86InstrInfo *getInstrInfo() const { + llvm_unreachable("getInstrInfo not implemented"); + } + virtual const TargetFrameLowering *getFrameLowering() const { + return &FrameLowering; + } + virtual X86JITInfo *getJITInfo() { + llvm_unreachable("getJITInfo not implemented"); + } + virtual const X86Subtarget *getSubtargetImpl() const{ return &Subtarget; } + virtual const X86TargetLowering *getTargetLowering() const { + llvm_unreachable("getTargetLowering not implemented"); + } + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + llvm_unreachable("getSelectionDAGInfo not implemented"); + } + virtual const X86RegisterInfo *getRegisterInfo() const { + return &getInstrInfo()->getRegisterInfo(); + } + virtual const InstrItineraryData *getInstrItineraryData() const { + return &InstrItins; + } + + /// \brief Register X86 analysis passes with a pass manager. + virtual void addAnalysisPasses(PassManagerBase &PM); + + // Set up the pass pipeline. + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + + virtual bool addCodeEmitter(PassManagerBase &PM, + JITCodeEmitter &JCE); +}; + +/// X86_32TargetMachine - X86 32-bit target machine. +/// +class X86_32TargetMachine : public X86TargetMachine { + virtual void anchor(); + const DataLayout DL; // Calculates type size & alignment + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86SelectionDAGInfo TSInfo; + X86JITInfo JITInfo; +public: + X86_32TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + virtual const DataLayout *getDataLayout() const { return &DL; } + virtual const X86TargetLowering *getTargetLowering() const { + return &TLInfo; + } + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + virtual const X86InstrInfo *getInstrInfo() const { + return &InstrInfo; + } + virtual X86JITInfo *getJITInfo() { + return &JITInfo; + } +}; + +/// X86_64TargetMachine - X86 64-bit target machine. 
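+/// (Illustrative note: the 64-bit variant differs from X86_32TargetMachine
+/// chiefly in the DataLayout its constructor selects, e.g. 64-bit pointers
+/// under LP64 versus 32-bit pointers under the x32 ILP32 model shown in
+/// X86TargetMachine.cpp above.)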
+/// +class X86_64TargetMachine : public X86TargetMachine { + virtual void anchor(); + const DataLayout DL; // Calculates type size & alignment + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86SelectionDAGInfo TSInfo; + X86JITInfo JITInfo; +public: + X86_64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + virtual const DataLayout *getDataLayout() const { return &DL; } + virtual const X86TargetLowering *getTargetLowering() const { + return &TLInfo; + } + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + virtual const X86InstrInfo *getInstrInfo() const { + return &InstrInfo; + } + virtual X86JITInfo *getJITInfo() { + return &JITInfo; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp new file mode 100644 index 0000000..871dacd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -0,0 +1,49 @@ +//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetObjectFile.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Target/Mangler.h" + +using namespace llvm; +using namespace dwarf; + +const MCExpr *X86_64MachoTargetObjectFile:: +getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI, unsigned Encoding, + MCStreamer &Streamer) const { + + // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which + // is an indirect pc-relative reference. + if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = Mang->getSymbol(GV); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Four = MCConstantExpr::Create(4, getContext()); + return MCBinaryExpr::CreateAdd(Res, Four, getContext()); + } + + return TargetLoweringObjectFileMachO:: + getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer); +} + +MCSymbol *X86_64MachoTargetObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + +void +X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h new file mode 100644 index 0000000..9d26d38 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -0,0 +1,43 @@ +//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+  /// x86-64.
+  class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+  public:
+    virtual const MCExpr *
+    getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                            MachineModuleInfo *MMI, unsigned Encoding,
+                            MCStreamer &Streamer) const;
+
+    // getCFIPersonalitySymbol - The symbol that gets passed to
+    // .cfi_personality.
+    virtual MCSymbol *
+    getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                            MachineModuleInfo *MMI) const;
+  };
+
+  /// X86LinuxTargetObjectFile - This implementation is used for Linux x86
+  /// and x86-64.
+  class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 0000000..eba9d78
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,530 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86tti"
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeX86TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class X86TTI : public ImmutablePass, public TargetTransformInfo {
+  const X86TargetMachine *TM;
+  const X86Subtarget *ST;
+  const X86TargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
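+  /// For example (illustrative): for a <4 x i32> value, Insert accounts for
+  /// four insertelement operations and Extract for four extractelement
+  /// operations on the vector type.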
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + +public: + X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + llvm_unreachable("This pass cannot be directly constructed"); + } + + X86TTI(const X86TargetMachine *TM) + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { + initializeX86TTIPass(*PassRegistry::getPassRegistry()); + } + + virtual void initializePass() { + pushTTIStack(this); + } + + virtual void finalizePass() { + popTTIStack(); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + TargetTransformInfo::getAnalysisUsage(AU); + } + + /// Pass identification. + static char ID; + + /// Provide necessary pointer adjustments for the two base classes. + virtual void *getAdjustedAnalysisPointer(const void *ID) { + if (ID == &TargetTransformInfo::ID) + return (TargetTransformInfo*)this; + return this; + } + + /// \name Scalar TTI Implementations + /// @{ + virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getRegisterBitWidth(bool Vector) const; + virtual unsigned getMaximumUnrollFactor() const; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind, + OperandValueKind) const; + virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, + int Index, Type *SubTp) const; + virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const; + virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const; + virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const; + virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const; + + /// @} +}; + +} // end anonymous namespace + +INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti", + "X86 Target Transform Info", true, true, false) +char X86TTI::ID = 0; + +ImmutablePass * +llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { + return new X86TTI(TM); +} + + +//===----------------------------------------------------------------------===// +// +// X86 cost model. +// +//===----------------------------------------------------------------------===// + +X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + // TODO: Currently the __builtin_popcount() implementation using SSE3 + // instructions is inefficient. Once the problem is fixed, we should + // call ST->hasSSE3() instead of ST->hasSSE4(). + return ST->hasSSE41() ? PSK_FastHardware : PSK_Software; +} + +unsigned X86TTI::getNumberOfRegisters(bool Vector) const { + if (Vector && !ST->hasSSE1()) + return 0; + + if (ST->is64Bit()) + return 16; + return 8; +} + +unsigned X86TTI::getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasAVX()) return 256; + if (ST->hasSSE1()) return 128; + return 0; + } + + if (ST->is64Bit()) + return 64; + return 32; + +} + +unsigned X86TTI::getMaximumUnrollFactor() const { + if (ST->isAtom()) + return 1; + + // Sandybridge and Haswell have multiple execution ports and pipelined + // vector units. + if (ST->hasAVX()) + return 4; + + return 2; +} + +unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Op1Info, + OperandValueKind Op2Info) const { + // Legalize the type. 
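+  // Worked example (assuming an SSE2-only subtarget): a v8i32 shift is not
+  // legal, so it is split into two v4i32 halves; LT = { 2, MVT::v4i32 } and
+  // any per-type cost found in the tables below is scaled by LT.first.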
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry<MVT> AVX2CostTable[] = {
+    // Shifts on v4i64/v8i32 are legal on AVX2 even though we declare them as
+    // custom in order to detect the cases where the shift amount is a scalar.
+    { ISD::SHL,  MVT::v4i32,      1 },
+    { ISD::SRL,  MVT::v4i32,      1 },
+    { ISD::SRA,  MVT::v4i32,      1 },
+    { ISD::SHL,  MVT::v8i32,      1 },
+    { ISD::SRL,  MVT::v8i32,      1 },
+    { ISD::SRA,  MVT::v8i32,      1 },
+    { ISD::SHL,  MVT::v2i64,      1 },
+    { ISD::SRL,  MVT::v2i64,      1 },
+    { ISD::SHL,  MVT::v4i64,      1 },
+    { ISD::SRL,  MVT::v4i64,      1 },
+
+    { ISD::SHL,  MVT::v32i8,     42 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v16i16, 16*10 }, // Scalarized.
+
+    { ISD::SRL,  MVT::v32i8,  32*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v16i16,  8*10 }, // Scalarized.
+
+    { ISD::SRA,  MVT::v32i8,  32*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v16i16, 16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v4i64,   4*10 }, // Scalarized.
+  };
+
+  // Look for AVX2 lowering tricks.
+  if (ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2CostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // Constant splats are cheaper for the following instructions.
+    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
+    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
+    { ISD::SHL,  MVT::v4i32,  1 }, // pslld.
+    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
+
+    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
+    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
+
+    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
+    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2()) {
+    int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
+                                   array_lengthof(SSE2UniformConstCostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+  }
+
+
+  static const CostTblEntry<MVT> SSE2CostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // For some cases, where the shift amount is a scalar, we would be able
+    // to generate better code. Unfortunately, when this is the case the value
+    // (the splat) will get hoisted out of the loop, thereby making it invisible
+    // to ISel. The cost model must return worst-case assumptions because it is
+    // used for vectorization and we don't want to make vectorized code worse
+    // than scalar code.
+    { ISD::SHL,  MVT::v16i8,    30 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
+    { ISD::SHL,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    { ISD::SRL,  MVT::v16i8, 16*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v4i32,  4*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    { ISD::SRA,  MVT::v16i8, 16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v4i32,  4*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v2i64,  2*10 }, // Scalarized.
+  };
+
+  if (ST->hasSSE2()) {
+    int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2CostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT> AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations and we only need to extract the upper YMM half.
+    // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL,  MVT::v8i32,  4 },
+    { ISD::SUB,  MVT::v8i32,  4 },
+    { ISD::ADD,  MVT::v8i32,  4 },
+    { ISD::SUB,  MVT::v4i64,  4 },
+    { ISD::ADD,  MVT::v4i64,  4 },
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+    // are lowered as a series of long multiplies (3), shifts (4) and adds (2).
+    // Because we believe v4i64 to be a legal type, we must also include the
+    // split factor of two in the cost table. Therefore, the cost here is 18
+    // instead of 9.
+    { ISD::MUL,  MVT::v4i64, 18 },
+  };
+
+  // Look for AVX1 lowering tricks.
+  if (ST->hasAVX() && !ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX1CostTable[Idx].Cost;
+  }
+
+  // Custom lowering of vectors.
+  static const CostTblEntry<MVT> CustomLowered[] = {
+    // A v2i64/v4i64 multiply is custom lowered as a series of long
+    // multiplies (3), shifts (4) and adds (2).
+    { ISD::MUL,  MVT::v2i64,  9 },
+    { ISD::MUL,  MVT::v4i64,  9 },
+  };
+  int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
+                                 ISD, LT.second);
+  if (Idx != -1)
+    return LT.first * CustomLowered[Idx].Cost;
+
+  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+  // 2x pmuludq, 2x shuffle.
+  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+      !ST->hasSSE41())
+    return 6;
+
+  // Fall back to the default implementation.
+  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+                                                     Op2Info);
+}
+
+unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  // We only estimate the cost of reverse shuffles.
+  if (Kind != SK_Reverse)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+  unsigned Cost = 1;
+  if (LT.second.getSizeInBits() > 128)
+    Cost = 3; // Extract + insert + copy.
+
+  // Multiply by the number of parts.
+  return Cost * LT.first;
+}
+
+unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
+  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
+
+  static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
+    // These are somewhat magic numbers justified by looking at the output of
+    // Intel's IACA, running some kernels and making sure when we take
+    // legalization into account the throughput will be overestimated.
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64,  2*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32,  4*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16,  8*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64,  2*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32,  4*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16,  8*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+    // There are faster sequences for float conversions.
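+    // (Worked example of the entries above: the v2f64 <- v4i32 conversion is
+    // modeled as four scalar int-to-fp conversions at an assumed ~10 cycles
+    // each, hence the 4*10 entry.)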
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + }; + + if (ST->hasSSE2() && !ST->hasAVX()) { + int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl, + array_lengthof(SSE2ConvTbl), + ISD, LTDest.second, LTSrc.second); + if (Idx != -1) + return LTSrc.first * SSE2ConvTbl[Idx].Cost; + } + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + // The function getSimpleVT only handles simple value types. + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + + static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, + + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, + + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, + }; + + if (ST->hasAVX()) { + int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl, + array_lengthof(AVXConversionTbl), + ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + if (Idx != -1) + return AVXConversionTbl[Idx].Cost; + } + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} + +unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + // Legalize the type. 
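+  // Worked example against the tables below: on AVX without AVX2, a v32i8
+  // SETCC stays a single 256-bit type (LT.first == 1) but costs 4, presumably
+  // reflecting two 128-bit compares plus an extract and an insert.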
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry<MVT> SSE42CostTbl[] = {
+    { ISD::SETCC,  MVT::v2f64,   1 },
+    { ISD::SETCC,  MVT::v4f32,   1 },
+    { ISD::SETCC,  MVT::v2i64,   1 },
+    { ISD::SETCC,  MVT::v4i32,   1 },
+    { ISD::SETCC,  MVT::v8i16,   1 },
+    { ISD::SETCC,  MVT::v16i8,   1 },
+  };
+
+  static const CostTblEntry<MVT> AVX1CostTbl[] = {
+    { ISD::SETCC,  MVT::v4f64,   1 },
+    { ISD::SETCC,  MVT::v8f32,   1 },
+    // AVX1 does not support 8-wide integer compare.
+    { ISD::SETCC,  MVT::v4i64,   4 },
+    { ISD::SETCC,  MVT::v8i32,   4 },
+    { ISD::SETCC,  MVT::v16i16,  4 },
+    { ISD::SETCC,  MVT::v32i8,   4 },
+  };
+
+  static const CostTblEntry<MVT> AVX2CostTbl[] = {
+    { ISD::SETCC,  MVT::v4i64,   1 },
+    { ISD::SETCC,  MVT::v8i32,   1 },
+    { ISD::SETCC,  MVT::v16i16,  1 },
+    { ISD::SETCC,  MVT::v32i8,   1 },
+  };
+
+  if (ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl),
+                                   ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX2CostTbl[Idx].Cost;
+  }
+
+  if (ST->hasAVX()) {
+    int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl),
+                                   ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX1CostTbl[Idx].Cost;
+  }
+
+  if (ST->hasSSE42()) {
+    int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl),
+                                   ISD, MTy);
+    if (Idx != -1)
+      return LT.first * SSE42CostTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                    unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // Floating point scalars are already located in index #0.
+    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+      return 0;
+  }
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  // Each load/store unit costs 1.
+  unsigned Cost = LT.first * 1;
+
+  // On Sandybridge, 256-bit load/stores are double-pumped
+  // (but not on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
+    Cost *= 2;
+
+  return Cost;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000..0f77948
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,293 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids the transition latency
+// penalty when transferring control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-vzeroupper"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+  struct VZeroUpperInserter : public MachineFunctionPass {
+    static char ID;
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+
+    // Any YMM register live-in to this function?
+    bool FnHasLiveInYmm;
+
+    // BBState - Contains the state of each MBB: unknown, clean, dirty
+    SmallVector<uint8_t, 8> BBState;
+
+    // BBSolved - Keep track of all MBBs which have already been analyzed
+    // and need no further processing.
+    BitVector BBSolved;
+
+    // Machine Basic Blocks are classified by this pass as follows:
+    //
+    //  ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
+    //    until the MBB exit there isn't an instruction using YMM to change
+    //    the state to dirty, or one of the incoming predecessors is unknown
+    //    and there's not a dirty predecessor between them.
+    //
+    //  ST_CLEAN - No YMM usage at the end of the MBB. An MBB could have
+    //    instructions using YMM and be marked ST_CLEAN, as long as the state
+    //    is cleaned by a vzeroupper before any call.
+    //
+    //  ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
+    //    vzeroupper instruction.
+    //
+    //  ST_INIT - Placeholder for an empty state set.
+    //
+    enum {
+      ST_UNKNOWN = 0,
+      ST_CLEAN   = 1,
+      ST_DIRTY   = 2,
+      ST_INIT    = 3
+    };
+
+    // computeState - Given two states, compute the resulting state, in
+    // the following way:
+    //
+    //  1) One dirty state yields another dirty state
+    //  2) All states must be clean for the result to be clean
+    //  3) If none of the above applies and one state is unknown, the result
+    //     is also unknown
+    //
+    static unsigned computeState(unsigned PrevState, unsigned CurState) {
+      if (PrevState == ST_INIT)
+        return CurState;
+
+      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
+        return ST_DIRTY;
+
+      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
+        return ST_CLEAN;
+
+      return ST_UNKNOWN;
+    }
+
+  };
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter();
+}
+
+static bool isYmmReg(unsigned Reg) {
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
+    return true;
+
+  return false;
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+       E = MRI.livein_end(); I != E; ++I)
+    if (isYmmReg(I->first))
+      return true;
+
+  return false;
+}
+
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+  // The loop bound must be inclusive so that YMM15 is checked as well,
+  // matching the range used by isYmmReg() above.
+  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
+  return true;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+      return true;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDebug())
+      continue;
+    if (isYmmReg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzeroupper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool EverMadeChange = false;
+
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions. This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = false;
+  const TargetRegisterClass *RC = &X86::VR256RegClass;
+  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+       i != e; i++) {
+    if (!MRI.reg_nodbg_empty(*i)) {
+      YMMUsed = true;
+      break;
+    }
+  }
+  if (!YMMUsed)
+    return EverMadeChange;
+
+  // Pre-compute the existence of any live-in YMM registers to this function.
+  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+  assert(BBState.empty());
+  BBState.resize(MF.getNumBlockIDs(), 0);
+  BBSolved.resize(MF.getNumBlockIDs(), 0);
+
+  // Each BB state depends on all predecessors, loop over until everything
+  // converges. (Once we converge, we can implicitly mark everything that is
+  // still ST_UNKNOWN as ST_CLEAN.)
+  while (1) {
+    bool MadeChange = false;
+
+    // Process all basic blocks.
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      MadeChange |= processBasicBlock(MF, *I);
+
+    // If this iteration over the code changed anything, keep iterating.
+    if (!MadeChange) break;
+    EverMadeChange = true;
+  }
+
+  BBState.clear();
+  BBSolved.clear();
+  return EverMadeChange;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzeroupper instructions before function calls.
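+/// Example of the predecessor merge performed via computeState() (values are
+/// the enum constants defined above; shown for illustration):
+///   computeState(ST_INIT,  ST_CLEAN)   == ST_CLEAN
+///   computeState(ST_CLEAN, ST_DIRTY)   == ST_DIRTY
+///   computeState(ST_CLEAN, ST_UNKNOWN) == ST_UNKNOWN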
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+                                           MachineBasicBlock &BB) {
+  bool Changed = false;
+  unsigned BBNum = BB.getNumber();
+
+  // Don't process already solved BBs.
+  if (BBSolved[BBNum])
+    return false; // No changes.
+
+  // Check the state of all predecessors.
+  unsigned EntryState = ST_INIT;
+  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
+       PE = BB.pred_end(); PI != PE; ++PI) {
+    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
+    if (EntryState == ST_DIRTY)
+      break;
+  }
+
+  // The entry MBB for the function may set the initial state to dirty if
+  // the function receives any YMM incoming arguments.
+  if (&BB == MF.begin()) {
+    EntryState = ST_CLEAN;
+    if (FnHasLiveInYmm)
+      EntryState = ST_DIRTY;
+  }
+
+  // The current state is initialized according to the predecessors.
+  unsigned CurState = EntryState;
+  bool BBHasCall = false;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    DebugLoc dl = I->getDebugLoc();
+    bool isControlFlow = MI->isCall() || MI->isReturn();
+
+    // Shortcut: don't need to check regular instructions in dirty state.
+    if (!isControlFlow && CurState == ST_DIRTY)
+      continue;
+
+    if (hasYmmReg(MI)) {
+      // We found a ymm-using instruction; this could be an AVX instruction,
+      // or it could be control flow.
+      CurState = ST_DIRTY;
+      continue;
+    }
+
+    // Check for control-flow out of the current function (which might
+    // indirectly execute SSE instructions).
+    if (!isControlFlow)
+      continue;
+
+    BBHasCall = true;
+
+    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+    // registers. This instruction has zero latency. In addition, the processor
+    // changes back to Clean state, after which execution of Intel SSE
+    // instructions or Intel AVX instructions has no transition penalty. Add
+    // the VZEROUPPER instruction before any function call/return that might
+    // execute SSE code.
+    // FIXME: In some cases, we may want to move the VZEROUPPER into a
+    // predecessor block.
+    if (CurState == ST_DIRTY) {
+      // Only insert the VZEROUPPER in case the entry state isn't unknown.
+      // When unknown, only compute the information within the block to have
+      // it available in the exit if possible, but don't change the block.
+      if (EntryState != ST_UNKNOWN) {
+        BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
+        ++NumVZU;
+      }
+
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM uses may appear before other subsequent calls or even before
+      // the end of the BB.
+      CurState = ST_CLEAN;
+    }
+  }
+
+  DEBUG(dbgs() << "MBB #" << BBNum
+               << ", current state: " << CurState << '\n');
+
+  // A BB can only be considered solved when we both have done all the
+  // necessary transformations and have computed the exit state. This happens
+  // in two cases:
+  //  1) We know the entry state: this immediately implies the exit state and
+  //     all the necessary transformations.
+  //  2) There are no calls, and a non-call instruction marks this block:
+  //     no transformations are necessary, and we know the exit state.
+  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
+    BBSolved[BBNum] = true;
+
+  if (CurState != BBState[BBNum])
+    Changed = true;
+
+  BBState[BBNum] = CurState;
+  return Changed;
+}
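+// A minimal sketch (not part of this file) of how this pass reaches the
+// codegen pipeline: X86PassConfig::addPreEmitPass() in X86TargetMachine.cpp
+// above does, in effect:
+//   if (getX86Subtarget().hasAVX() && UseVZeroUpper)
+//     addPass(createX86IssueVZeroUpperPass());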